mirror of
https://github.com/neondatabase/neon.git
synced 2026-02-14 16:10:37 +00:00
Compare commits
1 Commits
sk-test-wa
...
vlad/reset
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
83fdc07b20 |
@@ -183,7 +183,8 @@ runs:
|
||||
|
||||
# Run the tests.
|
||||
#
|
||||
# --alluredir saves test results in Allure format (in a specified directory)
|
||||
# The junit.xml file allows CI tools to display more fine-grained test information
|
||||
# in its "Tests" tab in the results page.
|
||||
# --verbose prints name of each test (helpful when there are
|
||||
# multiple tests in one file)
|
||||
# -rA prints summary in the end
|
||||
@@ -192,6 +193,7 @@ runs:
|
||||
#
|
||||
mkdir -p $TEST_OUTPUT/allure/results
|
||||
"${cov_prefix[@]}" ./scripts/pytest \
|
||||
--junitxml=$TEST_OUTPUT/junit.xml \
|
||||
--alluredir=$TEST_OUTPUT/allure/results \
|
||||
--tb=short \
|
||||
--verbose \
|
||||
|
||||
9
.github/workflows/actionlint.yml
vendored
9
.github/workflows/actionlint.yml
vendored
@@ -36,16 +36,15 @@ jobs:
|
||||
fail_on_error: true
|
||||
filter_mode: nofilter
|
||||
level: error
|
||||
|
||||
- name: Disallow 'ubuntu-latest' runners
|
||||
run: |
|
||||
- run: |
|
||||
PAT='^\s*runs-on:.*-latest'
|
||||
if grep -ERq $PAT .github/workflows; then
|
||||
if grep -ERq $PAT .github/workflows
|
||||
then
|
||||
grep -ERl $PAT .github/workflows |\
|
||||
while read -r f
|
||||
do
|
||||
l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
|
||||
echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
|
||||
echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
|
||||
done
|
||||
exit 1
|
||||
fi
|
||||
|
||||
17
.github/workflows/build_and_test.yml
vendored
17
.github/workflows/build_and_test.yml
vendored
@@ -1023,18 +1023,6 @@ jobs:
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
|
||||
# The default value is ~/.docker
|
||||
- name: Set custom docker config directory
|
||||
run: |
|
||||
mkdir -p .docker-custom
|
||||
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
|
||||
|
||||
- uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
|
||||
# `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
|
||||
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
|
||||
# Regular pageserver version string looks like
|
||||
@@ -1069,11 +1057,6 @@ jobs:
|
||||
docker compose -f ./docker-compose/docker-compose.yml logs || 0
|
||||
docker compose -f ./docker-compose/docker-compose.yml down
|
||||
|
||||
- name: Remove custom docker config directory
|
||||
if: always()
|
||||
run: |
|
||||
rm -rf .docker-custom
|
||||
|
||||
promote-images:
|
||||
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
12
.github/workflows/release.yml
vendored
12
.github/workflows/release.yml
vendored
@@ -52,15 +52,13 @@ jobs:
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
TITLE="Storage & Compute release ${RELEASE_DATE}"
|
||||
|
||||
cat << EOF > body.md
|
||||
## ${TITLE}
|
||||
## Storage & Compute release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
|
||||
gh pr create --title "${TITLE}" \
|
||||
gh pr create --title "Release ${RELEASE_DATE}" \
|
||||
--body-file "body.md" \
|
||||
--head "${RELEASE_BRANCH}" \
|
||||
--base "release"
|
||||
@@ -93,15 +91,13 @@ jobs:
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
|
||||
run: |
|
||||
TITLE="Proxy release ${RELEASE_DATE}"
|
||||
|
||||
cat << EOF > body.md
|
||||
## ${TITLE}
|
||||
## Proxy release ${RELEASE_DATE}
|
||||
|
||||
**Please merge this Pull Request using 'Create a merge commit' button**
|
||||
EOF
|
||||
|
||||
gh pr create --title "${TITLE}" \
|
||||
gh pr create --title "Proxy release ${RELEASE_DATE}" \
|
||||
--body-file "body.md" \
|
||||
--head "${RELEASE_BRANCH}" \
|
||||
--base "release-proxy"
|
||||
|
||||
@@ -36,11 +36,11 @@ use utils::pid_file::{self, PidFileRead};
|
||||
// it's waiting. If the process hasn't started/stopped after 5 seconds,
|
||||
// it prints a notice that it's taking long, but keeps waiting.
|
||||
//
|
||||
const STOP_RETRY_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
const STOP_RETRIES: u128 = STOP_RETRY_TIMEOUT.as_millis() / RETRY_INTERVAL.as_millis();
|
||||
const RETRY_INTERVAL: Duration = Duration::from_millis(100);
|
||||
const DOT_EVERY_RETRIES: u128 = 10;
|
||||
const NOTICE_AFTER_RETRIES: u128 = 50;
|
||||
const RETRY_UNTIL_SECS: u64 = 10;
|
||||
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
|
||||
const RETRY_INTERVAL_MILLIS: u64 = 100;
|
||||
const DOT_EVERY_RETRIES: u64 = 10;
|
||||
const NOTICE_AFTER_RETRIES: u64 = 50;
|
||||
|
||||
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
|
||||
/// it itself.
|
||||
@@ -52,7 +52,6 @@ pub enum InitialPidFile {
|
||||
}
|
||||
|
||||
/// Start a background child process using the parameters given.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
process_name: &str,
|
||||
datadir: &Path,
|
||||
@@ -60,7 +59,6 @@ pub async fn start_process<F, Fut, AI, A, EI>(
|
||||
args: AI,
|
||||
envs: EI,
|
||||
initial_pid_file: InitialPidFile,
|
||||
retry_timeout: &Duration,
|
||||
process_status_check: F,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
@@ -71,7 +69,6 @@ where
|
||||
// Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
|
||||
EI: IntoIterator<Item = (String, String)>,
|
||||
{
|
||||
let retries: u128 = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis();
|
||||
if !datadir.metadata().context("stat datadir")?.is_dir() {
|
||||
anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}");
|
||||
}
|
||||
@@ -133,7 +130,7 @@ where
|
||||
.unwrap();
|
||||
});
|
||||
|
||||
for retries in 0..retries {
|
||||
for retries in 0..RETRIES {
|
||||
match process_started(pid, pid_file_to_check, &process_status_check).await {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} started and passed status check, pid: {pid}");
|
||||
@@ -151,7 +148,7 @@ where
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(RETRY_INTERVAL);
|
||||
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("error starting process {process_name:?}: {e:#}");
|
||||
@@ -160,10 +157,9 @@ where
|
||||
}
|
||||
}
|
||||
println!();
|
||||
anyhow::bail!(format!(
|
||||
"{} did not start+pass status checks within {:?} seconds",
|
||||
process_name, retry_timeout
|
||||
));
|
||||
anyhow::bail!(
|
||||
"{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
|
||||
);
|
||||
}
|
||||
|
||||
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
|
||||
@@ -219,7 +215,7 @@ pub fn stop_process(
|
||||
}
|
||||
|
||||
pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
|
||||
for retries in 0..STOP_RETRIES {
|
||||
for retries in 0..RETRIES {
|
||||
match process_has_stopped(pid) {
|
||||
Ok(true) => {
|
||||
println!("\n{process_name} stopped");
|
||||
@@ -235,7 +231,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
|
||||
print!(".");
|
||||
io::stdout().flush().unwrap();
|
||||
}
|
||||
thread::sleep(RETRY_INTERVAL);
|
||||
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
|
||||
}
|
||||
Err(e) => {
|
||||
println!("{process_name} with pid {pid} failed to stop: {e:#}");
|
||||
@@ -244,10 +240,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
|
||||
}
|
||||
}
|
||||
println!();
|
||||
anyhow::bail!(format!(
|
||||
"{} with pid {} did not stop in {:?} seconds",
|
||||
process_name, pid, STOP_RETRY_TIMEOUT
|
||||
));
|
||||
anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
|
||||
}
|
||||
|
||||
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {
|
||||
|
||||
@@ -36,7 +36,6 @@ use std::collections::{BTreeSet, HashMap};
|
||||
use std::path::PathBuf;
|
||||
use std::process::exit;
|
||||
use std::str::FromStr;
|
||||
use std::time::Duration;
|
||||
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
|
||||
use url::Host;
|
||||
use utils::{
|
||||
@@ -100,7 +99,7 @@ fn main() -> Result<()> {
|
||||
let subcommand_result = match sub_name {
|
||||
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
|
||||
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
|
||||
"start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
|
||||
"start" => rt.block_on(handle_start_all(&env)),
|
||||
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
|
||||
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
|
||||
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
|
||||
@@ -1049,20 +1048,10 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
|
||||
))
|
||||
}
|
||||
|
||||
fn get_start_timeout(args: &ArgMatches) -> &Duration {
|
||||
let humantime_duration = args
|
||||
.get_one::<humantime::Duration>("start-timeout")
|
||||
.expect("invalid value for start-timeout");
|
||||
humantime_duration.as_ref()
|
||||
}
|
||||
|
||||
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", subcommand_args)) => {
|
||||
if let Err(e) = get_pageserver(env, subcommand_args)?
|
||||
.start(get_start_timeout(subcommand_args))
|
||||
.await
|
||||
{
|
||||
if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
@@ -1088,7 +1077,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await {
|
||||
if let Err(e) = pageserver.start().await {
|
||||
eprintln!("pageserver start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
@@ -1116,8 +1105,8 @@ async fn handle_storage_controller(
|
||||
) -> Result<()> {
|
||||
let svc = StorageController::from_env(env);
|
||||
match sub_match.subcommand() {
|
||||
Some(("start", start_match)) => {
|
||||
if let Err(e) = svc.start(get_start_timeout(start_match)).await {
|
||||
Some(("start", _start_match)) => {
|
||||
if let Err(e) = svc.start().await {
|
||||
eprintln!("start failed: {e}");
|
||||
exit(1);
|
||||
}
|
||||
@@ -1176,10 +1165,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
"start" => {
|
||||
let extra_opts = safekeeper_extra_opts(sub_args);
|
||||
|
||||
if let Err(e) = safekeeper
|
||||
.start(extra_opts, get_start_timeout(sub_args))
|
||||
.await
|
||||
{
|
||||
if let Err(e) = safekeeper.start(extra_opts).await {
|
||||
eprintln!("safekeeper start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1205,10 +1191,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
}
|
||||
|
||||
let extra_opts = safekeeper_extra_opts(sub_args);
|
||||
if let Err(e) = safekeeper
|
||||
.start(extra_opts, get_start_timeout(sub_args))
|
||||
.await
|
||||
{
|
||||
if let Err(e) = safekeeper.start(extra_opts).await {
|
||||
eprintln!("safekeeper start failed: {}", e);
|
||||
exit(1);
|
||||
}
|
||||
@@ -1221,18 +1204,15 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn handle_start_all(
|
||||
env: &local_env::LocalEnv,
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
// Endpoints are not started automatically
|
||||
|
||||
broker::start_broker_process(env, retry_timeout).await?;
|
||||
broker::start_broker_process(env).await?;
|
||||
|
||||
// Only start the storage controller if the pageserver is configured to need it
|
||||
if env.control_plane_api.is_some() {
|
||||
let storage_controller = StorageController::from_env(env);
|
||||
if let Err(e) = storage_controller.start(retry_timeout).await {
|
||||
if let Err(e) = storage_controller.start().await {
|
||||
eprintln!("storage_controller start failed: {:#}", e);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
@@ -1241,7 +1221,7 @@ async fn handle_start_all(
|
||||
|
||||
for ps_conf in &env.pageservers {
|
||||
let pageserver = PageServerNode::from_env(env, ps_conf);
|
||||
if let Err(e) = pageserver.start(retry_timeout).await {
|
||||
if let Err(e) = pageserver.start().await {
|
||||
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
|
||||
try_stop_all(env, true).await;
|
||||
exit(1);
|
||||
@@ -1250,7 +1230,7 @@ async fn handle_start_all(
|
||||
|
||||
for node in env.safekeepers.iter() {
|
||||
let safekeeper = SafekeeperNode::from_env(env, node);
|
||||
if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
|
||||
if let Err(e) = safekeeper.start(vec![]).await {
|
||||
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
|
||||
try_stop_all(env, false).await;
|
||||
exit(1);
|
||||
@@ -1310,15 +1290,6 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
|
||||
}
|
||||
|
||||
fn cli() -> Command {
|
||||
let timeout_arg = Arg::new("start-timeout")
|
||||
.long("start-timeout")
|
||||
.short('t')
|
||||
.global(true)
|
||||
.help("timeout until we fail the command, e.g. 30s")
|
||||
.value_parser(value_parser!(humantime::Duration))
|
||||
.default_value("10s")
|
||||
.required(false);
|
||||
|
||||
let branch_name_arg = Arg::new("branch-name")
|
||||
.long("branch-name")
|
||||
.help("Name of the branch to be created or used as an alias for other services")
|
||||
@@ -1538,7 +1509,6 @@ fn cli() -> Command {
|
||||
.subcommand(Command::new("status"))
|
||||
.subcommand(Command::new("start")
|
||||
.about("Start local pageserver")
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local pageserver")
|
||||
@@ -1546,15 +1516,13 @@ fn cli() -> Command {
|
||||
)
|
||||
.subcommand(Command::new("restart")
|
||||
.about("Restart local pageserver")
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("storage_controller")
|
||||
.arg_required_else_help(true)
|
||||
.about("Manage storage_controller")
|
||||
.subcommand(Command::new("start").about("Start storage controller")
|
||||
.arg(timeout_arg.clone()))
|
||||
.subcommand(Command::new("start").about("Start storage controller"))
|
||||
.subcommand(Command::new("stop").about("Stop storage controller")
|
||||
.arg(stop_mode_arg.clone()))
|
||||
)
|
||||
@@ -1566,7 +1534,6 @@ fn cli() -> Command {
|
||||
.about("Start local safekeeper")
|
||||
.arg(safekeeper_id_arg.clone())
|
||||
.arg(safekeeper_extra_opt_arg.clone())
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("stop")
|
||||
.about("Stop local safekeeper")
|
||||
@@ -1578,7 +1545,6 @@ fn cli() -> Command {
|
||||
.arg(safekeeper_id_arg)
|
||||
.arg(stop_mode_arg.clone())
|
||||
.arg(safekeeper_extra_opt_arg)
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
)
|
||||
.subcommand(
|
||||
@@ -1613,7 +1579,6 @@ fn cli() -> Command {
|
||||
.arg(remote_ext_config_args)
|
||||
.arg(create_test_user)
|
||||
.arg(allow_multiple.clone())
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
.subcommand(Command::new("reconfigure")
|
||||
.about("Reconfigure the endpoint")
|
||||
@@ -1665,7 +1630,6 @@ fn cli() -> Command {
|
||||
.subcommand(
|
||||
Command::new("start")
|
||||
.about("Start page server and safekeepers")
|
||||
.arg(timeout_arg.clone())
|
||||
)
|
||||
.subcommand(
|
||||
Command::new("stop")
|
||||
|
||||
@@ -5,18 +5,13 @@
|
||||
//! ```text
|
||||
//! .neon/safekeepers/<safekeeper id>
|
||||
//! ```
|
||||
use std::time::Duration;
|
||||
|
||||
use anyhow::Context;
|
||||
|
||||
use camino::Utf8PathBuf;
|
||||
|
||||
use crate::{background_process, local_env};
|
||||
|
||||
pub async fn start_broker_process(
|
||||
env: &local_env::LocalEnv,
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
|
||||
let broker = &env.broker;
|
||||
let listen_addr = &broker.listen_addr;
|
||||
|
||||
@@ -32,7 +27,6 @@ pub async fn start_broker_process(
|
||||
args,
|
||||
[],
|
||||
background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
let url = broker.client_url();
|
||||
let status_url = url.join("status").with_context(|| {
|
||||
|
||||
@@ -158,8 +158,8 @@ impl PageServerNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
self.start_node(retry_timeout).await
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
self.start_node().await
|
||||
}
|
||||
|
||||
fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
|
||||
@@ -214,15 +214,14 @@ impl PageServerNode {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
async fn start_node(&self) -> anyhow::Result<()> {
|
||||
// TODO: using a thread here because start_process() is not async but we need to call check_status()
|
||||
let datadir = self.repo_path();
|
||||
print!(
|
||||
"Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
|
||||
"Starting pageserver node {} at '{}' in {:?}",
|
||||
self.conf.id,
|
||||
self.pg_connection_config.raw_address(),
|
||||
datadir,
|
||||
retry_timeout
|
||||
datadir
|
||||
);
|
||||
io::stdout().flush().context("flush stdout")?;
|
||||
|
||||
@@ -240,7 +239,6 @@ impl PageServerNode {
|
||||
args,
|
||||
self.pageserver_env_variables()?,
|
||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
let st = self.check_status().await;
|
||||
match st {
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
//! ```
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
use std::time::Duration;
|
||||
use std::{io, result};
|
||||
|
||||
use anyhow::Context;
|
||||
@@ -112,16 +111,11 @@ impl SafekeeperNode {
|
||||
.expect("non-Unicode path")
|
||||
}
|
||||
|
||||
pub async fn start(
|
||||
&self,
|
||||
extra_opts: Vec<String>,
|
||||
retry_timeout: &Duration,
|
||||
) -> anyhow::Result<()> {
|
||||
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
|
||||
print!(
|
||||
"Starting safekeeper at '{}' in '{}', retrying for {:?}",
|
||||
"Starting safekeeper at '{}' in '{}'",
|
||||
self.pg_connection_config.raw_address(),
|
||||
self.datadir_path().display(),
|
||||
retry_timeout,
|
||||
self.datadir_path().display()
|
||||
);
|
||||
io::stdout().flush().unwrap();
|
||||
|
||||
@@ -206,7 +200,6 @@ impl SafekeeperNode {
|
||||
&args,
|
||||
self.safekeeper_env_variables()?,
|
||||
background_process::InitialPidFile::Expect(self.pid_file()),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
match self.check_status().await {
|
||||
Ok(()) => Ok(true),
|
||||
|
||||
@@ -18,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
|
||||
use postgres_backend::AuthType;
|
||||
use reqwest::Method;
|
||||
use serde::{de::DeserializeOwned, Deserialize, Serialize};
|
||||
use std::{fs, str::FromStr, time::Duration};
|
||||
use std::{fs, str::FromStr};
|
||||
use tokio::process::Command;
|
||||
use tracing::instrument;
|
||||
use url::Url;
|
||||
@@ -224,7 +224,7 @@ impl StorageController {
|
||||
Ok(database_url)
|
||||
}
|
||||
|
||||
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
|
||||
pub async fn start(&self) -> anyhow::Result<()> {
|
||||
// Start a vanilla Postgres process used by the storage controller for persistence.
|
||||
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
|
||||
.unwrap()
|
||||
@@ -272,7 +272,6 @@ impl StorageController {
|
||||
db_start_args,
|
||||
[],
|
||||
background_process::InitialPidFile::Create(self.postgres_pid_file()),
|
||||
retry_timeout,
|
||||
|| self.pg_isready(&pg_bin_dir),
|
||||
)
|
||||
.await?;
|
||||
@@ -327,7 +326,6 @@ impl StorageController {
|
||||
args,
|
||||
[],
|
||||
background_process::InitialPidFile::Create(self.pid_file()),
|
||||
retry_timeout,
|
||||
|| async {
|
||||
match self.ready().await {
|
||||
Ok(_) => Ok(true),
|
||||
|
||||
@@ -5,3 +5,4 @@ TODO:
|
||||
- shared across tenants
|
||||
- store pages from layer files
|
||||
- store pages from "in-memory layer"
|
||||
- store materialized pages
|
||||
|
||||
@@ -134,7 +134,7 @@ depends on that, so if you change it, bad things will happen.
|
||||
|
||||
#### page_cache_size
|
||||
|
||||
Size of the page cache. Unit is
|
||||
Size of the page cache, to hold materialized page versions. Unit is
|
||||
number of 8 kB blocks. The default is 8192, which means 64 MB.
|
||||
|
||||
#### max_file_descriptors
|
||||
|
||||
@@ -293,6 +293,22 @@ pub struct TenantCreateRequest {
|
||||
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
#[serde(deny_unknown_fields)]
|
||||
pub struct TenantLoadRequest {
|
||||
#[serde(default)]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub generation: Option<u32>,
|
||||
}
|
||||
|
||||
impl std::ops::Deref for TenantCreateRequest {
|
||||
type Target = TenantConfig;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.config
|
||||
}
|
||||
}
|
||||
|
||||
/// An alternative representation of `pageserver::tenant::TenantConf` with
|
||||
/// simpler types.
|
||||
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
|
||||
|
||||
@@ -39,8 +39,8 @@ use crate::tenant::{
|
||||
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
|
||||
use crate::{tenant::config::TenantConf, virtual_file};
|
||||
use crate::{
|
||||
TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME,
|
||||
TIMELINE_DELETE_MARK_SUFFIX,
|
||||
IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
|
||||
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
|
||||
};
|
||||
|
||||
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
|
||||
@@ -811,6 +811,11 @@ impl PageServerConf {
|
||||
self.tenants_path().join(tenant_shard_id.to_string())
|
||||
}
|
||||
|
||||
pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
|
||||
self.tenant_path(tenant_shard_id)
|
||||
.join(IGNORED_TENANT_FILE_NAME)
|
||||
}
|
||||
|
||||
/// Points to a place in pageserver's local directory,
|
||||
/// where certain tenant's tenantconf file should be located.
|
||||
///
|
||||
|
||||
@@ -78,14 +78,29 @@ paths:
|
||||
|
||||
delete:
|
||||
description: |
|
||||
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried. Deleting
|
||||
a non-existent tenant is considered successful (returns 200).
|
||||
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
|
||||
404 means that deletion successfully finished"
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant was successfully deleted, or was already not found.
|
||||
"503":
|
||||
description: Service is unavailable, or tenant is already being modified (perhaps concurrently deleted)
|
||||
|
||||
"404":
|
||||
description: Tenant not found. This is a success result, equivalent to 200.
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/NotFoundError"
|
||||
"409":
|
||||
description: Deletion is already in progress, continue polling
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
"412":
|
||||
description: Deletion may not proceed, tenant is not in Active state
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/PreconditionFailedError"
|
||||
|
||||
/v1/tenant/{tenant_id}/time_travel_remote_storage:
|
||||
parameters:
|
||||
@@ -374,6 +389,48 @@ paths:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/ConflictError"
|
||||
/v1/tenant/{tenant_id}/ignore:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
Remove tenant data (including all corresponding timelines) from pageserver's memory.
|
||||
Files on local disk and remote storage are not affected.
|
||||
|
||||
Future pageserver restarts won't load the data back until `load` is called on such tenant.
|
||||
responses:
|
||||
"200":
|
||||
description: Tenant ignored
|
||||
|
||||
|
||||
/v1/tenant/{tenant_id}/load:
|
||||
parameters:
|
||||
- name: tenant_id
|
||||
in: path
|
||||
required: true
|
||||
schema:
|
||||
type: string
|
||||
post:
|
||||
description: |
|
||||
Schedules an operation that attempts to load a tenant from the local disk and
|
||||
synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load.
|
||||
If the tenant was ignored before, removes the ignore mark and continues with load scheduling.
|
||||
|
||||
Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
|
||||
Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
|
||||
requestBody:
|
||||
required: false
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
$ref: "#/components/schemas/TenantLoadRequest"
|
||||
responses:
|
||||
"202":
|
||||
description: Tenant scheduled to load successfully
|
||||
|
||||
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
|
||||
parameters:
|
||||
|
||||
@@ -36,7 +36,7 @@ use pageserver_api::models::TopTenantShardsRequest;
|
||||
use pageserver_api::models::TopTenantShardsResponse;
|
||||
use pageserver_api::models::{
|
||||
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
|
||||
TenantLocationConfigRequest,
|
||||
TenantLoadRequest, TenantLocationConfigRequest,
|
||||
};
|
||||
use pageserver_api::shard::ShardCount;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
@@ -205,6 +205,7 @@ impl From<TenantSlotError> for ApiError {
|
||||
NotFound(tenant_id) => {
|
||||
ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
|
||||
}
|
||||
e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
|
||||
InProgress => {
|
||||
ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
|
||||
}
|
||||
@@ -334,10 +335,13 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
||||
use crate::tenant::delete::DeleteTenantError::*;
|
||||
match value {
|
||||
Get(g) => ApiError::from(g),
|
||||
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
|
||||
Timeline(t) => ApiError::from(t),
|
||||
NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
|
||||
SlotError(e) => e.into(),
|
||||
SlotUpsertError(e) => e.into(),
|
||||
Other(o) => ApiError::InternalServerError(o),
|
||||
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
|
||||
Cancelled => ApiError::ShuttingDown,
|
||||
}
|
||||
}
|
||||
@@ -887,6 +891,8 @@ async fn tenant_detach_handler(
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
|
||||
|
||||
// This is a legacy API (`/location_conf` is the replacement). It only supports unsharded tenants
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
@@ -894,7 +900,12 @@ async fn tenant_detach_handler(
|
||||
let conf = state.conf;
|
||||
state
|
||||
.tenant_manager
|
||||
.detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
|
||||
.detach_tenant(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
detach_ignored.unwrap_or(false),
|
||||
&state.deletion_queue_client,
|
||||
)
|
||||
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
|
||||
.await?;
|
||||
|
||||
@@ -921,6 +932,54 @@ async fn tenant_reset_handler(
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_load_handler(
|
||||
mut request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||
|
||||
let maybe_body: Option<TenantLoadRequest> = json_request_or_empty_body(&mut request).await?;
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
// The /load request is only usable when control_plane_api is not set. Once it is set, callers
|
||||
// should always use /attach instead.
|
||||
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
|
||||
|
||||
mgr::load_tenant(
|
||||
state.conf,
|
||||
tenant_id,
|
||||
generation,
|
||||
state.broker_client.clone(),
|
||||
state.remote_storage.clone(),
|
||||
state.deletion_queue_client.clone(),
|
||||
&ctx,
|
||||
)
|
||||
.instrument(info_span!("load", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::ACCEPTED, ())
|
||||
}
|
||||
|
||||
async fn tenant_ignore_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
) -> Result<Response<Body>, ApiError> {
|
||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
||||
check_permission(&request, Some(tenant_id))?;
|
||||
|
||||
let state = get_state(&request);
|
||||
let conf = state.conf;
|
||||
mgr::ignore_tenant(conf, tenant_id)
|
||||
.instrument(info_span!("ignore_tenant", %tenant_id))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
|
||||
async fn tenant_list_handler(
|
||||
request: Request<Body>,
|
||||
_cancel: CancellationToken,
|
||||
@@ -1012,16 +1071,23 @@ async fn tenant_delete_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
state
|
||||
let status = state
|
||||
.tenant_manager
|
||||
.delete_tenant(tenant_shard_id)
|
||||
.delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
|
||||
.instrument(info_span!("tenant_delete_handler",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug()
|
||||
))
|
||||
.await?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
// Callers use 404 as success for deletions, for historical reasons.
|
||||
if status == StatusCode::NOT_FOUND {
|
||||
return Err(ApiError::NotFound(
|
||||
anyhow::anyhow!("Deletion complete").into(),
|
||||
));
|
||||
}
|
||||
|
||||
json_response(status, ())
|
||||
}
|
||||
|
||||
/// HTTP endpoint to query the current tenant_size of a tenant.
|
||||
@@ -1441,7 +1507,7 @@ async fn put_tenant_location_config_handler(
|
||||
if let LocationConfigMode::Detached = request_data.config.mode {
|
||||
if let Err(e) = state
|
||||
.tenant_manager
|
||||
.detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
|
||||
.detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
|
||||
.instrument(info_span!("tenant_detach",
|
||||
tenant_id = %tenant_shard_id.tenant_id,
|
||||
shard_id = %tenant_shard_id.shard_slug()
|
||||
@@ -2698,6 +2764,12 @@ pub fn make_router(
|
||||
.post("/v1/tenant/:tenant_shard_id/reset", |r| {
|
||||
api_handler(r, tenant_reset_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/load", |r| {
|
||||
api_handler(r, tenant_load_handler)
|
||||
})
|
||||
.post("/v1/tenant/:tenant_id/ignore", |r| {
|
||||
api_handler(r, tenant_ignore_handler)
|
||||
})
|
||||
.post(
|
||||
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|
||||
|r| api_handler(r, timeline_preserve_initdb_handler),
|
||||
|
||||
@@ -136,6 +136,13 @@ pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
|
||||
|
||||
pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
|
||||
|
||||
/// A marker file to prevent pageserver from loading a certain tenant on restart.
|
||||
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
|
||||
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
|
||||
/// into pageserver's memory before being ignored.
|
||||
/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
|
||||
pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
|
||||
|
||||
pub fn is_temporary(path: &Utf8Path) -> bool {
|
||||
match path.file_name() {
|
||||
Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
|
||||
|
||||
@@ -145,6 +145,14 @@ impl ReconstructTimeMetrics {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_direct_total",
|
||||
"Number of cache hits from materialized page cache without redo",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) struct ReconstructDataTimeMetrics {
|
||||
singular: Histogram,
|
||||
vectored: Histogram,
|
||||
@@ -174,6 +182,14 @@ pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> =
|
||||
}
|
||||
});
|
||||
|
||||
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
|
||||
register_int_counter!(
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
"Number of cache hits from materialized page cache",
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
});
|
||||
|
||||
pub(crate) struct GetVectoredLatency {
|
||||
map: EnumMap<TaskKind, Option<Histogram>>,
|
||||
}
|
||||
@@ -282,8 +298,12 @@ pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
|
||||
});
|
||||
|
||||
pub(crate) struct PageCacheMetricsForTaskKind {
|
||||
pub read_accesses_materialized_page: IntCounter,
|
||||
pub read_accesses_immutable: IntCounter,
|
||||
|
||||
pub read_hits_immutable: IntCounter,
|
||||
pub read_hits_materialized_page_exact: IntCounter,
|
||||
pub read_hits_materialized_page_older_lsn: IntCounter,
|
||||
}
|
||||
|
||||
pub(crate) struct PageCacheMetrics {
|
||||
@@ -316,6 +336,16 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
|
||||
let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
|
||||
let content_kind: &'static str = content_kind.into();
|
||||
PageCacheMetricsForTaskKind {
|
||||
read_accesses_materialized_page: {
|
||||
PAGE_CACHE_READ_ACCESSES
|
||||
.get_metric_with_label_values(&[
|
||||
task_kind,
|
||||
"materialized_page",
|
||||
content_kind,
|
||||
])
|
||||
.unwrap()
|
||||
},
|
||||
|
||||
read_accesses_immutable: {
|
||||
PAGE_CACHE_READ_ACCESSES
|
||||
.get_metric_with_label_values(&[task_kind, "immutable", content_kind])
|
||||
@@ -327,6 +357,28 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
|
||||
.get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
|
||||
.unwrap()
|
||||
},
|
||||
|
||||
read_hits_materialized_page_exact: {
|
||||
PAGE_CACHE_READ_HITS
|
||||
.get_metric_with_label_values(&[
|
||||
task_kind,
|
||||
"materialized_page",
|
||||
content_kind,
|
||||
"exact",
|
||||
])
|
||||
.unwrap()
|
||||
},
|
||||
|
||||
read_hits_materialized_page_older_lsn: {
|
||||
PAGE_CACHE_READ_HITS
|
||||
.get_metric_with_label_values(&[
|
||||
task_kind,
|
||||
"materialized_page",
|
||||
content_kind,
|
||||
"older_lsn",
|
||||
])
|
||||
.unwrap()
|
||||
},
|
||||
}
|
||||
}))
|
||||
})),
|
||||
@@ -342,6 +394,7 @@ pub(crate) struct PageCacheSizeMetrics {
|
||||
pub max_bytes: UIntGauge,
|
||||
|
||||
pub current_bytes_immutable: UIntGauge,
|
||||
pub current_bytes_materialized_page: UIntGauge,
|
||||
}
|
||||
|
||||
static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||
@@ -367,6 +420,11 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
|
||||
.get_metric_with_label_values(&["immutable"])
|
||||
.unwrap()
|
||||
},
|
||||
current_bytes_materialized_page: {
|
||||
PAGE_CACHE_SIZE_CURRENT_BYTES
|
||||
.get_metric_with_label_values(&["materialized_page"])
|
||||
.unwrap()
|
||||
},
|
||||
});
|
||||
|
||||
pub(crate) mod page_cache_eviction_metrics {
|
||||
@@ -1347,23 +1405,17 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
|
||||
.map(|ms| (ms as f64) / 1000.0)
|
||||
});
|
||||
|
||||
pub(crate) struct BasebackupQueryTime {
|
||||
ok: Histogram,
|
||||
error: Histogram,
|
||||
}
|
||||
|
||||
pub(crate) struct BasebackupQueryTime(HistogramVec);
|
||||
pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
|
||||
let vec = register_histogram_vec!(
|
||||
"pageserver_basebackup_query_seconds",
|
||||
"Histogram of basebackup queries durations, by result type",
|
||||
&["result"],
|
||||
COMPUTE_STARTUP_BUCKETS.to_vec(),
|
||||
)
|
||||
.expect("failed to define a metric");
|
||||
BasebackupQueryTime {
|
||||
ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
|
||||
error: vec.get_metric_with_label_values(&["error"]).unwrap(),
|
||||
}
|
||||
BasebackupQueryTime({
|
||||
register_histogram_vec!(
|
||||
"pageserver_basebackup_query_seconds",
|
||||
"Histogram of basebackup queries durations, by result type",
|
||||
&["result"],
|
||||
COMPUTE_STARTUP_BUCKETS.to_vec(),
|
||||
)
|
||||
.expect("failed to define a metric")
|
||||
})
|
||||
});
|
||||
|
||||
pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
@@ -1418,11 +1470,12 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
|
||||
elapsed
|
||||
}
|
||||
};
|
||||
let metric = if res.is_ok() {
|
||||
&self.parent.ok
|
||||
} else {
|
||||
&self.parent.error
|
||||
};
|
||||
let label_value = if res.is_ok() { "ok" } else { "error" };
|
||||
let metric = self
|
||||
.parent
|
||||
.0
|
||||
.get_metric_with_label_values(&[label_value])
|
||||
.unwrap();
|
||||
metric.observe(ex_throttled.as_secs_f64());
|
||||
}
|
||||
}
|
||||
@@ -2865,11 +2918,13 @@ pub fn preinitialize_metrics() {
|
||||
// FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
|
||||
// order:
|
||||
// - global metrics reside in a Lazy<PageserverMetrics>
|
||||
// - access via crate::metrics::PS_METRICS.some_metric.inc()
|
||||
// - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
|
||||
// - could move the statics into TimelineMetrics::new()?
|
||||
|
||||
// counters
|
||||
[
|
||||
&MATERIALIZED_PAGE_CACHE_HIT,
|
||||
&MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
|
||||
&UNEXPECTED_ONDEMAND_DOWNLOADS,
|
||||
&WALRECEIVER_STARTED_CONNECTIONS,
|
||||
&WALRECEIVER_BROKER_UPDATES,
|
||||
@@ -2931,5 +2986,4 @@ pub fn preinitialize_metrics() {
|
||||
// Custom
|
||||
Lazy::force(&RECONSTRUCT_TIME);
|
||||
Lazy::force(&tenant_throttling::TIMELINE_GET);
|
||||
Lazy::force(&BASEBACKUP_QUERY_TIME);
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
//!
|
||||
//! Two types of pages are supported:
|
||||
//!
|
||||
//! * **Materialized pages**, filled & used by page reconstruction
|
||||
//! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`].
|
||||
//!
|
||||
//! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only.
|
||||
@@ -27,6 +28,9 @@
|
||||
//! Page cache maps from a cache key to a buffer slot.
|
||||
//! The cache key uniquely identifies the piece of data that is being cached.
|
||||
//!
|
||||
//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
|
||||
//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
|
||||
//!
|
||||
//! The cache key for **immutable file** pages is [`FileId`] and a block number.
|
||||
//! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following:
|
||||
//! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`].
|
||||
@@ -78,10 +82,13 @@ use std::{
|
||||
|
||||
use anyhow::Context;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
|
||||
repository::Key,
|
||||
};
|
||||
|
||||
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
|
||||
@@ -132,7 +139,33 @@ pub fn next_file_id() -> FileId {
|
||||
#[derive(Debug, PartialEq, Eq, Clone)]
|
||||
#[allow(clippy::enum_variant_names)]
|
||||
enum CacheKey {
|
||||
ImmutableFilePage { file_id: FileId, blkno: u32 },
|
||||
MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey,
|
||||
lsn: Lsn,
|
||||
},
|
||||
ImmutableFilePage {
|
||||
file_id: FileId,
|
||||
blkno: u32,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
|
||||
struct MaterializedPageHashKey {
|
||||
/// Why is this TenantShardId rather than TenantId?
|
||||
///
|
||||
/// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However, this
|
||||
/// this not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this
|
||||
/// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
|
||||
/// special-cased in some other way.
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
key: Key,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct Version {
|
||||
lsn: Lsn,
|
||||
slot_idx: usize,
|
||||
}
|
||||
|
||||
struct Slot {
|
||||
@@ -203,6 +236,17 @@ impl SlotInner {
|
||||
}
|
||||
|
||||
pub struct PageCache {
|
||||
/// This contains the mapping from the cache key to buffer slot that currently
|
||||
/// contains the page, if any.
|
||||
///
|
||||
/// TODO: This is protected by a single lock. If that becomes a bottleneck,
|
||||
/// this HashMap can be replaced with a more concurrent version, there are
|
||||
/// plenty of such crates around.
|
||||
///
|
||||
/// If you add support for caching different kinds of objects, each object kind
|
||||
/// can have a separate mapping map, next to this field.
|
||||
materialized_page_map: std::sync::RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
|
||||
|
||||
immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,
|
||||
|
||||
/// The actual buffers with their metadata.
|
||||
@@ -327,14 +371,175 @@ pub enum ReadBufResult<'a> {
|
||||
}
|
||||
|
||||
impl PageCache {
|
||||
//
|
||||
// Section 1.1: Public interface functions for looking up and memorizing materialized page
|
||||
// versions in the page cache
|
||||
//
|
||||
|
||||
/// Look up a materialized page version.
|
||||
///
|
||||
/// The 'lsn' is an upper bound, this will return the latest version of
|
||||
/// the given block, but not newer than 'lsn'. Returns the actual LSN of the
|
||||
/// returned page.
|
||||
pub async fn lookup_materialized_page(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
key: &Key,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Option<(Lsn, PageReadGuard)> {
|
||||
let Ok(permit) = self.try_get_pinned_slot_permit().await else {
|
||||
return None;
|
||||
};
|
||||
|
||||
crate::metrics::PAGE_CACHE
|
||||
.for_ctx(ctx)
|
||||
.read_accesses_materialized_page
|
||||
.inc();
|
||||
|
||||
let mut cache_key = CacheKey::MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
key: *key,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
|
||||
if let Some(guard) = self
|
||||
.try_lock_for_read(&mut cache_key, &mut Some(permit))
|
||||
.await
|
||||
{
|
||||
if let CacheKey::MaterializedPage {
|
||||
hash_key: _,
|
||||
lsn: available_lsn,
|
||||
} = cache_key
|
||||
{
|
||||
if available_lsn == lsn {
|
||||
crate::metrics::PAGE_CACHE
|
||||
.for_ctx(ctx)
|
||||
.read_hits_materialized_page_exact
|
||||
.inc();
|
||||
} else {
|
||||
crate::metrics::PAGE_CACHE
|
||||
.for_ctx(ctx)
|
||||
.read_hits_materialized_page_older_lsn
|
||||
.inc();
|
||||
}
|
||||
Some((available_lsn, guard))
|
||||
} else {
|
||||
panic!("unexpected key type in slot");
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Store an image of the given page in the cache.
|
||||
///
|
||||
pub async fn memorize_materialized_page(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
key: Key,
|
||||
lsn: Lsn,
|
||||
img: &[u8],
|
||||
) -> anyhow::Result<()> {
|
||||
let cache_key = CacheKey::MaterializedPage {
|
||||
hash_key: MaterializedPageHashKey {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
key,
|
||||
},
|
||||
lsn,
|
||||
};
|
||||
|
||||
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
|
||||
loop {
|
||||
// First check if the key already exists in the cache.
|
||||
if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we don't released the mapping
|
||||
// lock already, another thread could have evicted the page)
|
||||
let slot = &self.slots[slot_idx];
|
||||
let inner = slot.inner.write().await;
|
||||
if inner.key.as_ref() == Some(&cache_key) {
|
||||
slot.inc_usage_count();
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
debug_assert_eq!(inner.buf.len(), img.len());
|
||||
// We already had it in cache. Another thread must've put it there
|
||||
// concurrently. Check that it had the same contents that we
|
||||
// replayed.
|
||||
assert!(inner.buf == img);
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
debug_assert!(permit.is_some());
|
||||
|
||||
// Not found. Find a victim buffer
|
||||
let (slot_idx, mut inner) = self
|
||||
.find_victim(permit.as_ref().unwrap())
|
||||
.await
|
||||
.context("Failed to find evict victim")?;
|
||||
|
||||
// Insert mapping for this. At this point, we may find that another
|
||||
// thread did the same thing concurrently. In that case, we evicted
|
||||
// our victim buffer unnecessarily. Put it into the free list and
|
||||
// continue with the slot that the other thread chose.
|
||||
if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
|
||||
// TODO: put to free list
|
||||
|
||||
// We now just loop back to start from beginning. This is not
|
||||
// optimal, we'll perform the lookup in the mapping again, which
|
||||
// is not really necessary because we already got
|
||||
// 'existing_slot_idx'. But this shouldn't happen often enough
|
||||
// to matter much.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Make the slot ready
|
||||
let slot = &self.slots[slot_idx];
|
||||
inner.key = Some(cache_key.clone());
|
||||
slot.set_usage_count(1);
|
||||
// Create a write guard for the slot so we go through the expected motions.
|
||||
debug_assert!(
|
||||
{
|
||||
let guard = inner.permit.lock().unwrap();
|
||||
guard.upgrade().is_none()
|
||||
},
|
||||
"we hold a write lock, so, no one else should have a permit"
|
||||
);
|
||||
let mut write_guard = PageWriteGuard {
|
||||
state: PageWriteGuardState::Invalid {
|
||||
_permit: permit.take().unwrap(),
|
||||
inner,
|
||||
},
|
||||
};
|
||||
write_guard.copy_from_slice(img);
|
||||
let _ = write_guard.mark_valid();
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
// Section 1.2: Public interface functions for working with immutable file pages.
|
||||
|
||||
pub async fn read_immutable_buf(
|
||||
&self,
|
||||
file_id: FileId,
|
||||
blkno: u32,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ReadBufResult> {
|
||||
self.lock_for_read(&(CacheKey::ImmutableFilePage { file_id, blkno }), ctx)
|
||||
.await
|
||||
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
|
||||
|
||||
self.lock_for_read(&mut cache_key, ctx).await
|
||||
}
|
||||
|
||||
//
|
||||
@@ -368,11 +573,19 @@ impl PageCache {
|
||||
|
||||
/// Look up a page in the cache.
|
||||
///
|
||||
/// If the search criteria is not exact, *cache_key is updated with the key
|
||||
/// for exact key of the returned page. (For materialized pages, that means
|
||||
/// that the LSN in 'cache_key' is updated with the LSN of the returned page
|
||||
/// version.)
|
||||
///
|
||||
/// If no page is found, returns None and *cache_key is left unmodified.
|
||||
///
|
||||
async fn try_lock_for_read(
|
||||
&self,
|
||||
cache_key: &CacheKey,
|
||||
cache_key: &mut CacheKey,
|
||||
permit: &mut Option<PinnedSlotsPermit>,
|
||||
) -> Option<PageReadGuard> {
|
||||
let cache_key_orig = cache_key.clone();
|
||||
if let Some(slot_idx) = self.search_mapping(cache_key) {
|
||||
// The page was found in the mapping. Lock the slot, and re-check
|
||||
// that it's still what we expected (because we released the mapping
|
||||
@@ -385,6 +598,9 @@ impl PageCache {
|
||||
_permit: inner.coalesce_readers_permit(permit.take().unwrap()),
|
||||
slot_guard: inner,
|
||||
});
|
||||
} else {
|
||||
// search_mapping might have modified the search key; restore it.
|
||||
*cache_key = cache_key_orig;
|
||||
}
|
||||
}
|
||||
None
|
||||
@@ -421,12 +637,15 @@ impl PageCache {
|
||||
///
|
||||
async fn lock_for_read(
|
||||
&self,
|
||||
cache_key: &CacheKey,
|
||||
cache_key: &mut CacheKey,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<ReadBufResult> {
|
||||
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
|
||||
|
||||
let (read_access, hit) = match cache_key {
|
||||
CacheKey::MaterializedPage { .. } => {
|
||||
unreachable!("Materialized pages use lookup_materialized_page")
|
||||
}
|
||||
CacheKey::ImmutableFilePage { .. } => (
|
||||
&crate::metrics::PAGE_CACHE
|
||||
.for_ctx(ctx)
|
||||
@@ -498,15 +717,52 @@ impl PageCache {
|
||||
|
||||
/// Search for a page in the cache using the given search key.
|
||||
///
|
||||
/// Returns the slot index, if any.
|
||||
/// Returns the slot index, if any. If the search criteria is not exact,
|
||||
/// *cache_key is updated with the actual key of the found page.
|
||||
///
|
||||
/// NOTE: We don't hold any lock on the mapping on return, so the slot might
|
||||
/// get recycled for an unrelated page immediately after this function
|
||||
/// returns. The caller is responsible for re-checking that the slot still
|
||||
/// contains the page with the same key before using it.
|
||||
///
|
||||
fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
|
||||
fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
|
||||
match cache_key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
let versions = map.get(hash_key)?;
|
||||
|
||||
let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
|
||||
Ok(version_idx) => version_idx,
|
||||
Err(0) => return None,
|
||||
Err(version_idx) => version_idx - 1,
|
||||
};
|
||||
let version = &versions[version_idx];
|
||||
*lsn = version.lsn;
|
||||
Some(version.slot_idx)
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let map = self.immutable_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Search for a page in the cache using the given search key.
|
||||
///
|
||||
/// Like 'search_mapping, but performs an "exact" search. Used for
|
||||
/// allocating a new buffer.
|
||||
fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
|
||||
match key {
|
||||
CacheKey::MaterializedPage { hash_key, lsn } => {
|
||||
let map = self.materialized_page_map.read().unwrap();
|
||||
let versions = map.get(hash_key)?;
|
||||
|
||||
if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
|
||||
Some(versions[version_idx].slot_idx)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let map = self.immutable_page_map.read().unwrap();
|
||||
Some(*map.get(&(*file_id, *blkno))?)
|
||||
@@ -519,6 +775,27 @@ impl PageCache {
|
||||
///
|
||||
fn remove_mapping(&self, old_key: &CacheKey) {
|
||||
match old_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: old_hash_key,
|
||||
lsn: old_lsn,
|
||||
} => {
|
||||
let mut map = self.materialized_page_map.write().unwrap();
|
||||
if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
|
||||
let versions = old_entry.get_mut();
|
||||
|
||||
if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
|
||||
versions.remove(version_idx);
|
||||
self.size_metrics
|
||||
.current_bytes_materialized_page
|
||||
.sub_page_sz(1);
|
||||
if versions.is_empty() {
|
||||
old_entry.remove_entry();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
panic!("could not find old key in mapping")
|
||||
}
|
||||
}
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let mut map = self.immutable_page_map.write().unwrap();
|
||||
map.remove(&(*file_id, *blkno))
|
||||
@@ -535,6 +812,30 @@ impl PageCache {
|
||||
/// of the existing mapping and leaves it untouched.
|
||||
fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
|
||||
match new_key {
|
||||
CacheKey::MaterializedPage {
|
||||
hash_key: new_key,
|
||||
lsn: new_lsn,
|
||||
} => {
|
||||
let mut map = self.materialized_page_map.write().unwrap();
|
||||
let versions = map.entry(new_key.clone()).or_default();
|
||||
match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
|
||||
Ok(version_idx) => Some(versions[version_idx].slot_idx),
|
||||
Err(version_idx) => {
|
||||
versions.insert(
|
||||
version_idx,
|
||||
Version {
|
||||
lsn: *new_lsn,
|
||||
slot_idx,
|
||||
},
|
||||
);
|
||||
self.size_metrics
|
||||
.current_bytes_materialized_page
|
||||
.add_page_sz(1);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CacheKey::ImmutableFilePage { file_id, blkno } => {
|
||||
let mut map = self.immutable_page_map.write().unwrap();
|
||||
match map.entry((*file_id, *blkno)) {
|
||||
@@ -648,6 +949,7 @@ impl PageCache {
|
||||
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
|
||||
size_metrics.max_bytes.set_page_sz(num_pages);
|
||||
size_metrics.current_bytes_immutable.set_page_sz(0);
|
||||
size_metrics.current_bytes_materialized_page.set_page_sz(0);
|
||||
|
||||
let slots = page_buffer
|
||||
.chunks_exact_mut(PAGE_SZ)
|
||||
@@ -666,6 +968,7 @@ impl PageCache {
|
||||
.collect();
|
||||
|
||||
Self {
|
||||
materialized_page_map: Default::default(),
|
||||
immutable_page_map: Default::default(),
|
||||
slots,
|
||||
next_evict_slot: AtomicUsize::new(0),
|
||||
|
||||
@@ -6,23 +6,25 @@ use pageserver_api::{models::TenantState, shard::TenantShardId};
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, Instrument};
|
||||
use tracing::{error, instrument, Instrument};
|
||||
|
||||
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
task_mgr::{self},
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
mgr::{TenantSlot, TenantsMapRemoveResult},
|
||||
remote_timeline_client::remote_heatmap_path,
|
||||
timeline::ShutdownMode,
|
||||
},
|
||||
};
|
||||
|
||||
use super::{
|
||||
mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
|
||||
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
span,
|
||||
timeline::delete::DeleteTimelineFlow,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
|
||||
};
|
||||
@@ -32,6 +34,15 @@ pub(crate) enum DeleteTenantError {
|
||||
#[error("GetTenant {0}")]
|
||||
Get(#[from] GetTenantError),
|
||||
|
||||
#[error("Tenant not attached")]
|
||||
NotAttached,
|
||||
|
||||
#[error("Invalid state {0}. Expected Active or Broken")]
|
||||
InvalidState(TenantState),
|
||||
|
||||
#[error("Tenant deletion is already in progress")]
|
||||
AlreadyInProgress,
|
||||
|
||||
#[error("Tenant map slot error {0}")]
|
||||
SlotError(#[from] TenantSlotError),
|
||||
|
||||
@@ -63,6 +74,56 @@ fn remote_tenant_delete_mark_path(
|
||||
Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
|
||||
}
|
||||
|
||||
async fn create_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
|
||||
|
||||
let data: &[u8] = &[];
|
||||
backoff::retry(
|
||||
|| async {
|
||||
let data = bytes::Bytes::from_static(data);
|
||||
let stream = futures::stream::once(futures::future::ready(Ok(data)));
|
||||
remote_storage
|
||||
.upload(stream, 0, &remote_mark_path, None, cancel)
|
||||
.await
|
||||
},
|
||||
TimeoutOrCancel::caused_by_cancel,
|
||||
FAILED_UPLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES,
|
||||
"mark_upload",
|
||||
cancel,
|
||||
)
|
||||
.await
|
||||
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
|
||||
.and_then(|x| x)
|
||||
.context("mark_upload")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn create_local_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);
|
||||
|
||||
// Note: we're ok to replace existing file.
|
||||
let _ = std::fs::OpenOptions::new()
|
||||
.write(true)
|
||||
.create(true)
|
||||
.truncate(true)
|
||||
.open(&marker_path)
|
||||
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
|
||||
|
||||
crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn schedule_ordered_timeline_deletions(
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
|
||||
@@ -201,6 +262,21 @@ async fn cleanup_remaining_fs_traces(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
/// 1. Upload remote deletion mark.
|
||||
/// 2. Create local mark file.
|
||||
/// 3. Shutdown tasks
|
||||
/// 4. Run ordered timeline deletions
|
||||
/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
|
||||
/// 6. Remove remote mark
|
||||
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are two entrypoints to the process:
|
||||
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
|
||||
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTenantFlow {
|
||||
#[default]
|
||||
@@ -210,6 +286,91 @@ pub enum DeleteTenantFlow {
|
||||
}
|
||||
|
||||
impl DeleteTenantFlow {
|
||||
// These steps are run in the context of management api request handler.
|
||||
// Long running steps are continued to run in the background.
|
||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||
// error out if some of the shutdown tasks have already been completed!
|
||||
// NOTE: static needed for background part.
|
||||
// We assume that calling code sets up the span with tenant_id.
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn run(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
pausable_failpoint!("tenant-delete-before-run");
|
||||
|
||||
let mut guard = Self::prepare(&tenant).await?;
|
||||
|
||||
if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Helper function needed to be able to match once on returned error and transition tenant into broken state.
|
||||
// This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
|
||||
// will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
|
||||
// So the solution is to set tenant state to broken.
|
||||
async fn run_inner(
|
||||
guard: &mut OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
tenant: &Tenant,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-create-remote-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
|
||||
.await
|
||||
.context("remote_mark")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-create-local-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
create_local_delete_mark(conf, &tenant.tenant_shard_id)
|
||||
.await
|
||||
.context("local delete mark")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-background", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-background"
|
||||
))?
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::Finished => anyhow::bail!("Bug. Is in finished state"),
|
||||
Self::InProgress { .. } => { /* We're in a retry */ }
|
||||
Self::NotStarted => { /* Fresh start */ }
|
||||
}
|
||||
|
||||
*self = Self::InProgress;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn should_resume_deletion(
|
||||
conf: &'static PageServerConf,
|
||||
remote_mark_exists: bool,
|
||||
@@ -267,6 +428,79 @@ impl DeleteTenantFlow {
|
||||
.await
|
||||
}
|
||||
|
||||
/// Check whether background deletion of this tenant is currently in progress
|
||||
pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
|
||||
tenant.delete_progress.try_lock().is_err()
|
||||
}
|
||||
|
||||
async fn prepare(
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
|
||||
// FIXME: unsure about active only. Our init jobs may not be cancellable properly,
|
||||
// so at least for now allow deletions only for active tenants. TODO recheck
|
||||
// Broken and Stopping is needed for retries.
|
||||
if !matches!(
|
||||
tenant.current_state(),
|
||||
TenantState::Active | TenantState::Broken { .. }
|
||||
) {
|
||||
return Err(DeleteTenantError::InvalidState(tenant.current_state()));
|
||||
}
|
||||
|
||||
let guard = Arc::clone(&tenant.delete_progress)
|
||||
.try_lock_owned()
|
||||
.map_err(|_| DeleteTenantError::AlreadyInProgress)?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-shutdown", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
|
||||
});
|
||||
|
||||
// make pageserver shutdown not to wait for our completion
|
||||
let (_, progress) = completion::channel();
|
||||
|
||||
// It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
|
||||
// i e it is an error to do:
|
||||
// tenant.set_stopping
|
||||
// tenant.shutdown
|
||||
// Its also bad that we're holding tenants.read here.
|
||||
// TODO relax set_stopping to be idempotent?
|
||||
if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"tenant shutdown is already in progress"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(guard)
|
||||
}
|
||||
|
||||
fn schedule_background(
|
||||
guard: OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
) {
|
||||
let tenant_shard_id = tenant.tenant_shard_id;
|
||||
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TimelineDeletionWorker,
|
||||
Some(tenant_shard_id),
|
||||
None,
|
||||
"tenant_delete",
|
||||
false,
|
||||
async move {
|
||||
if let Err(err) =
|
||||
Self::background(guard, conf, remote_storage, tenants, &tenant).await
|
||||
{
|
||||
error!("Error: {err:#}");
|
||||
tenant.set_broken(format!("{err:#}")).await;
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
.instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
|
||||
);
|
||||
}
|
||||
|
||||
async fn background(
|
||||
mut guard: OwnedMutexGuard<Self>,
|
||||
conf: &PageServerConf,
|
||||
@@ -346,6 +580,8 @@ impl DeleteTenantFlow {
|
||||
.context("cleanup_remaining_fs_traces")?;
|
||||
|
||||
{
|
||||
pausable_failpoint!("tenant-delete-before-map-remove");
|
||||
|
||||
// This block is simply removing the TenantSlot for this tenant. It requires a loop because
|
||||
// we might conflict with a TenantSlot::InProgress marker and need to wait for it.
|
||||
//
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
|
||||
use futures::StreamExt;
|
||||
use hyper::StatusCode;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::models::LocationConfigMode;
|
||||
@@ -26,7 +27,8 @@ use tokio::task::JoinSet;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::*;
|
||||
|
||||
use utils::{backoff, completion, crashsafe};
|
||||
use remote_storage::GenericRemoteStorage;
|
||||
use utils::{completion, crashsafe};
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{DownloadBehavior, RequestContext};
|
||||
@@ -40,11 +42,12 @@ use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{
|
||||
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
|
||||
};
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
|
||||
use crate::tenant::storage_layer::inmemory_layer;
|
||||
use crate::tenant::timeline::ShutdownMode;
|
||||
use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
|
||||
use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
|
||||
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
|
||||
|
||||
use utils::crashsafe::path_with_suffix_extension;
|
||||
use utils::fs_ext::PathExt;
|
||||
@@ -419,6 +422,12 @@ fn load_tenant_config(
|
||||
}
|
||||
};
|
||||
|
||||
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
|
||||
if tenant_ignore_mark_file.exists() {
|
||||
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Ok(Some((
|
||||
tenant_shard_id,
|
||||
Tenant::load_tenant_config(conf, &tenant_shard_id),
|
||||
@@ -704,6 +713,12 @@ fn tenant_spawn(
|
||||
"Cannot load tenant from empty directory {tenant_path:?}"
|
||||
);
|
||||
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
|
||||
anyhow::ensure!(
|
||||
!conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(),
|
||||
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
||||
);
|
||||
|
||||
let remote_storage = resources.remote_storage.clone();
|
||||
let tenant = match Tenant::spawn(
|
||||
conf,
|
||||
@@ -1052,7 +1067,7 @@ impl TenantManager {
|
||||
// not do significant I/O, and shutdowns should be prompt via cancellation tokens.
|
||||
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)
|
||||
.map_err(|e| match e {
|
||||
TenantSlotError::NotFound(_) => {
|
||||
TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => {
|
||||
unreachable!("Called with mode Any")
|
||||
}
|
||||
TenantSlotError::InProgress => UpsertLocationError::InProgress,
|
||||
@@ -1352,10 +1367,56 @@ impl TenantManager {
|
||||
}
|
||||
}
|
||||
|
||||
async fn delete_tenant_remote(
|
||||
pub(crate) async fn delete_tenant(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
activation_timeout: Duration,
|
||||
) -> Result<StatusCode, DeleteTenantError> {
|
||||
super::span::debug_assert_current_span_has_tenant_id();
|
||||
// We acquire a SlotGuard during this function to protect against concurrent
|
||||
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
|
||||
// have to return the Tenant to the map while the background deletion runs.
|
||||
//
|
||||
// TODO: refactor deletion to happen outside the lifetime of a Tenant.
|
||||
// Currently, deletion requires a reference to the tenants map in order to
|
||||
// keep the Tenant in the map until deletion is complete, and then remove
|
||||
// it at the end.
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5080
|
||||
|
||||
// Tenant deletion can happen two ways:
|
||||
// - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping
|
||||
// state until deletion is complete.
|
||||
// - New: called on a pageserver without an attached location. We proceed with deletion from
|
||||
// remote storage.
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5080 for more context on this transition.
|
||||
|
||||
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
|
||||
match &slot_guard.old_value {
|
||||
Some(TenantSlot::Attached(tenant)) => {
|
||||
// Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
|
||||
// deletion will be resumed across restarts.
|
||||
let tenant = tenant.clone();
|
||||
return self
|
||||
.delete_tenant_attached(slot_guard, tenant, activation_timeout)
|
||||
.await;
|
||||
}
|
||||
Some(TenantSlot::Secondary(secondary_tenant)) => {
|
||||
secondary_tenant.shutdown().await;
|
||||
let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
|
||||
let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("local tenant directory {local_tenant_directory:?} rename")
|
||||
})?;
|
||||
spawn_background_purge(tmp_dir);
|
||||
}
|
||||
Some(TenantSlot::InProgress(_)) => unreachable!(),
|
||||
None => {}
|
||||
};
|
||||
|
||||
// Fall through: local state for this tenant is no longer present, proceed with remote delete
|
||||
let remote_path = remote_tenant_path(&tenant_shard_id);
|
||||
let keys = match self
|
||||
.resources
|
||||
@@ -1372,7 +1433,7 @@ impl TenantManager {
|
||||
Err(remote_storage::DownloadError::Cancelled) => {
|
||||
return Err(DeleteTenantError::Cancelled)
|
||||
}
|
||||
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
|
||||
Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND),
|
||||
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
|
||||
};
|
||||
|
||||
@@ -1386,83 +1447,60 @@ impl TenantManager {
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
// Callers use 404 as success for deletions, for historical reasons.
|
||||
Ok(StatusCode::NOT_FOUND)
|
||||
}
|
||||
|
||||
/// If a tenant is attached, detach it. Then remove its data from remote storage.
|
||||
///
|
||||
/// A tenant is considered deleted once it is gone from remote storage. It is the caller's
|
||||
/// responsibility to avoid trying to attach the tenant again or use it any way once deletion
|
||||
/// has started: this operation is not atomic, and must be retried until it succeeds.
|
||||
pub(crate) async fn delete_tenant(
|
||||
async fn delete_tenant_attached(
|
||||
&self,
|
||||
tenant_shard_id: TenantShardId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
super::span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
async fn delete_local(
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
) -> anyhow::Result<()> {
|
||||
let local_tenant_directory = conf.tenant_path(tenant_shard_id);
|
||||
let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("local tenant directory {local_tenant_directory:?} rename")
|
||||
})?;
|
||||
spawn_background_purge(tmp_dir);
|
||||
Ok(())
|
||||
slot_guard: SlotGuard,
|
||||
tenant: Arc<Tenant>,
|
||||
activation_timeout: Duration,
|
||||
) -> Result<StatusCode, DeleteTenantError> {
|
||||
match tenant.current_state() {
|
||||
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
|
||||
// If deletion is already in progress, return success (the semantics of this
|
||||
// function are to rerturn success afterr deletion is spawned in background).
|
||||
// Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
|
||||
if DeleteTenantFlow::is_in_progress(&tenant) {
|
||||
// The `delete_progress` lock is held: deletion is already happening
|
||||
// in the bacckground
|
||||
slot_guard.revert();
|
||||
return Ok(StatusCode::ACCEPTED);
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
tenant
|
||||
.wait_to_become_active(activation_timeout)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
GetActiveTenantError::WillNotBecomeActive(_)
|
||||
| GetActiveTenantError::Broken(_) => {
|
||||
DeleteTenantError::InvalidState(tenant.current_state())
|
||||
}
|
||||
GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
|
||||
GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached,
|
||||
GetActiveTenantError::WaitForActiveTimeout {
|
||||
latest_state: _latest_state,
|
||||
wait_time: _wait_time,
|
||||
} => DeleteTenantError::InvalidState(tenant.current_state()),
|
||||
})?;
|
||||
}
|
||||
}
|
||||
|
||||
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
|
||||
match &slot_guard.old_value {
|
||||
Some(TenantSlot::Attached(tenant)) => {
|
||||
// Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
|
||||
// deletion will be resumed across restarts.
|
||||
let tenant = tenant.clone();
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||
Ok(()) => {}
|
||||
Err(barrier) => {
|
||||
info!("Shutdown already in progress, waiting for it to complete");
|
||||
barrier.wait().await;
|
||||
}
|
||||
}
|
||||
delete_local(self.conf, &tenant_shard_id).await?;
|
||||
}
|
||||
Some(TenantSlot::Secondary(secondary_tenant)) => {
|
||||
secondary_tenant.shutdown().await;
|
||||
|
||||
delete_local(self.conf, &tenant_shard_id).await?;
|
||||
}
|
||||
Some(TenantSlot::InProgress(_)) => unreachable!(),
|
||||
None => {}
|
||||
};
|
||||
|
||||
// Fall through: local state for this tenant is no longer present, proceed with remote delete.
|
||||
// - We use a retry wrapper here so that common transient S3 errors (e.g. 503, 429) do not result
|
||||
// in 500 responses to delete requests.
|
||||
// - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will
|
||||
// 503/retry, rather than kicking off a wasteful concurrent deletion.
|
||||
match backoff::retry(
|
||||
|| async move { self.delete_tenant_remote(tenant_shard_id).await },
|
||||
|e| match e {
|
||||
DeleteTenantError::Cancelled => true,
|
||||
DeleteTenantError::SlotError(_) => {
|
||||
unreachable!("Remote deletion doesn't touch slots")
|
||||
}
|
||||
_ => false,
|
||||
},
|
||||
1,
|
||||
3,
|
||||
&format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"),
|
||||
let result = DeleteTenantFlow::run(
|
||||
self.conf,
|
||||
self.resources.remote_storage.clone(),
|
||||
&TENANTS,
|
||||
tenant,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Some(r) => r,
|
||||
None => Err(DeleteTenantError::Cancelled),
|
||||
}
|
||||
.await;
|
||||
|
||||
// The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
|
||||
slot_guard.revert();
|
||||
let () = result?;
|
||||
Ok(StatusCode::ACCEPTED)
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
|
||||
@@ -1863,10 +1901,17 @@ impl TenantManager {
|
||||
&self,
|
||||
conf: &'static PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
detach_ignored: bool,
|
||||
deletion_queue_client: &DeletionQueueClient,
|
||||
) -> Result<(), TenantStateError> {
|
||||
let tmp_path = self
|
||||
.detach_tenant0(conf, &TENANTS, tenant_shard_id, deletion_queue_client)
|
||||
.detach_tenant0(
|
||||
conf,
|
||||
&TENANTS,
|
||||
tenant_shard_id,
|
||||
detach_ignored,
|
||||
deletion_queue_client,
|
||||
)
|
||||
.await?;
|
||||
spawn_background_purge(tmp_path);
|
||||
|
||||
@@ -1878,6 +1923,7 @@ impl TenantManager {
|
||||
conf: &'static PageServerConf,
|
||||
tenants: &std::sync::RwLock<TenantsMap>,
|
||||
tenant_shard_id: TenantShardId,
|
||||
detach_ignored: bool,
|
||||
deletion_queue_client: &DeletionQueueClient,
|
||||
) -> Result<Utf8PathBuf, TenantStateError> {
|
||||
let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
|
||||
@@ -1900,6 +1946,26 @@ impl TenantManager {
|
||||
// before this tenant is potentially re-attached elsewhere.
|
||||
deletion_queue_client.flush_advisory();
|
||||
|
||||
// Ignored tenants are not present in memory and will bail the removal from memory operation.
|
||||
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
|
||||
if detach_ignored
|
||||
&& matches!(
|
||||
removal_result,
|
||||
Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
|
||||
)
|
||||
{
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
|
||||
if tenant_ignore_mark.exists() {
|
||||
info!("Detaching an ignored tenant");
|
||||
let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!("Ignored tenant {tenant_shard_id} local directory rename")
|
||||
})?;
|
||||
return Ok(tmp_path);
|
||||
}
|
||||
}
|
||||
|
||||
removal_result
|
||||
}
|
||||
|
||||
@@ -2156,6 +2222,97 @@ pub(crate) enum TenantStateError {
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
pub(crate) async fn load_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
generation: Generation,
|
||||
broker_client: storage_broker::BrokerClientChannel,
|
||||
remote_storage: GenericRemoteStorage,
|
||||
deletion_queue_client: DeletionQueueClient,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), TenantMapInsertError> {
|
||||
// This is a legacy API (replaced by `/location_conf`). It does not support sharding
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
|
||||
let slot_guard =
|
||||
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
|
||||
let tenant_path = conf.tenant_path(&tenant_shard_id);
|
||||
|
||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
|
||||
if tenant_ignore_mark.exists() {
|
||||
std::fs::remove_file(&tenant_ignore_mark).with_context(|| {
|
||||
format!(
|
||||
"Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"
|
||||
)
|
||||
})?;
|
||||
}
|
||||
|
||||
let resources = TenantSharedResources {
|
||||
broker_client,
|
||||
remote_storage,
|
||||
deletion_queue_client,
|
||||
};
|
||||
|
||||
let mut location_conf =
|
||||
Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
|
||||
location_conf.attach_in_generation(AttachmentMode::Single, generation);
|
||||
|
||||
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
|
||||
|
||||
let shard_identity = location_conf.shard;
|
||||
let new_tenant = tenant_spawn(
|
||||
conf,
|
||||
tenant_shard_id,
|
||||
&tenant_path,
|
||||
resources,
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
shard_identity,
|
||||
None,
|
||||
&TENANTS,
|
||||
SpawnMode::Eager,
|
||||
ctx,
|
||||
)
|
||||
.with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
|
||||
|
||||
slot_guard.upsert(TenantSlot::Attached(new_tenant))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn ignore_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), TenantStateError> {
|
||||
ignore_tenant0(conf, &TENANTS, tenant_id).await
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(shard_id))]
|
||||
async fn ignore_tenant0(
|
||||
conf: &'static PageServerConf,
|
||||
tenants: &std::sync::RwLock<TenantsMap>,
|
||||
tenant_id: TenantId,
|
||||
) -> Result<(), TenantStateError> {
|
||||
// This is a legacy API (replaced by `/location_conf`). It does not support sharding
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
tracing::Span::current().record(
|
||||
"shard_id",
|
||||
tracing::field::display(tenant_shard_id.shard_slug()),
|
||||
);
|
||||
|
||||
remove_tenant_from_memory(tenants, tenant_shard_id, async {
|
||||
let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
|
||||
fs::File::create(&ignore_mark_file)
|
||||
.await
|
||||
.context("Failed to create ignore mark file")
|
||||
.and_then(|_| {
|
||||
crashsafe::fsync_file_and_parent(&ignore_mark_file)
|
||||
.context("Failed to fsync ignore mark file")
|
||||
})
|
||||
.with_context(|| format!("Failed to crate ignore mark for tenant {tenant_shard_id}"))?;
|
||||
Ok(())
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub(crate) enum TenantMapListError {
|
||||
#[error("tenant map is still initiailizing")]
|
||||
@@ -2180,6 +2337,10 @@ pub(crate) enum TenantSlotError {
|
||||
#[error("Tenant {0} not found")]
|
||||
NotFound(TenantShardId),
|
||||
|
||||
/// When acquiring a slot with the expectation that the tenant does not already exist.
|
||||
#[error("tenant {0} already exists, state: {1:?}")]
|
||||
AlreadyExists(TenantShardId, TenantState),
|
||||
|
||||
// Tried to read a slot that is currently being mutated by another administrative
|
||||
// operation.
|
||||
#[error("tenant has a state change in progress, try again later")]
|
||||
@@ -2495,6 +2656,8 @@ enum TenantSlotAcquireMode {
|
||||
Any,
|
||||
/// Return an error if trying to acquire a slot and it doesn't already exist
|
||||
MustExist,
|
||||
/// Return an error if trying to acquire a slot and it already exists
|
||||
MustNotExist,
|
||||
}
|
||||
|
||||
fn tenant_map_acquire_slot(
|
||||
@@ -2548,6 +2711,27 @@ fn tenant_map_acquire_slot_impl(
|
||||
tracing::debug!("Occupied, failing for InProgress");
|
||||
Err(TenantSlotError::InProgress)
|
||||
}
|
||||
(slot, MustNotExist) => match slot {
|
||||
TenantSlot::Attached(tenant) => {
|
||||
tracing::debug!("Attached && MustNotExist, return AlreadyExists");
|
||||
Err(TenantSlotError::AlreadyExists(
|
||||
*tenant_shard_id,
|
||||
tenant.current_state(),
|
||||
))
|
||||
}
|
||||
_ => {
|
||||
// FIXME: the AlreadyExists error assumes that we have a Tenant
|
||||
// to get the state from
|
||||
tracing::debug!("Occupied & MustNotExist, return AlreadyExists");
|
||||
Err(TenantSlotError::AlreadyExists(
|
||||
*tenant_shard_id,
|
||||
TenantState::Broken {
|
||||
reason: "Present but not attached".to_string(),
|
||||
backtrace: "".to_string(),
|
||||
},
|
||||
))
|
||||
}
|
||||
},
|
||||
_ => {
|
||||
// Happy case: the slot was not in any state that violated our mode
|
||||
let (completion, barrier) = utils::completion::channel();
|
||||
|
||||
@@ -101,7 +101,9 @@ use crate::{
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::keyspace::{KeyPartitioning, KeySpace};
|
||||
use crate::metrics::TimelineMetrics;
|
||||
use crate::metrics::{
|
||||
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
|
||||
};
|
||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use pageserver_api::reltag::RelTag;
|
||||
@@ -118,6 +120,7 @@ use utils::{
|
||||
simple_rcu::{Rcu, RcuReadGuard},
|
||||
};
|
||||
|
||||
use crate::page_cache;
|
||||
use crate::repository::GcResult;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr;
|
||||
@@ -131,7 +134,7 @@ use self::layer_manager::LayerManager;
|
||||
use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::config::TenantConf;
|
||||
use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
|
||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
|
||||
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
|
||||
@@ -884,11 +887,32 @@ impl Timeline {
|
||||
|
||||
self.timeline_get_throttle.throttle(ctx, 1).await;
|
||||
|
||||
// Check the page cache. We will get back the most recent page with lsn <= `lsn`.
|
||||
// The cached image can be returned directly if there is no WAL between the cached image
|
||||
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
|
||||
// for redo.
|
||||
let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await {
|
||||
Some((cached_lsn, cached_img)) => {
|
||||
match cached_lsn.cmp(&lsn) {
|
||||
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
|
||||
Ordering::Equal => {
|
||||
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.inc();
|
||||
return Ok(cached_img); // exact LSN match, return the image
|
||||
}
|
||||
Ordering::Greater => {
|
||||
unreachable!("the returned lsn should never be after the requested lsn")
|
||||
}
|
||||
}
|
||||
Some((cached_lsn, cached_img))
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
match self.conf.get_impl {
|
||||
GetImpl::Legacy => {
|
||||
let reconstruct_state = ValueReconstructState {
|
||||
records: Vec::new(),
|
||||
img: None,
|
||||
img: cached_page_img,
|
||||
};
|
||||
|
||||
self.get_impl(key, lsn, reconstruct_state, ctx).await
|
||||
@@ -902,6 +926,13 @@ impl Timeline {
|
||||
// entry returned above.
|
||||
let mut reconstruct_state = ValuesReconstructState::new();
|
||||
|
||||
// Only add the cached image to the reconstruct state when it exists.
|
||||
if cached_page_img.is_some() {
|
||||
let mut key_state = VectoredValueReconstructState::default();
|
||||
key_state.img = cached_page_img;
|
||||
reconstruct_state.keys.insert(key, Ok(key_state));
|
||||
}
|
||||
|
||||
let vectored_res = self
|
||||
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await;
|
||||
@@ -3209,6 +3240,7 @@ impl Timeline {
|
||||
ValueReconstructResult::Continue => {
|
||||
// If we reached an earlier cached page image, we're done.
|
||||
if cont_lsn == cached_lsn + 1 {
|
||||
MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
|
||||
return Ok(traversal_path);
|
||||
}
|
||||
if let Some(prev) = prev_lsn {
|
||||
@@ -3582,6 +3614,26 @@ impl Timeline {
|
||||
})
|
||||
}
|
||||
|
||||
/// # Cancel-safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
async fn lookup_cached_page(
|
||||
&self,
|
||||
key: &Key,
|
||||
lsn: Lsn,
|
||||
ctx: &RequestContext,
|
||||
) -> Option<(Lsn, Bytes)> {
|
||||
let cache = page_cache::get();
|
||||
|
||||
// FIXME: It's pointless to check the cache for things that are not 8kB pages.
|
||||
// We should look at the key to determine if it's a cacheable object
|
||||
let (lsn, read_guard) = cache
|
||||
.lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx)
|
||||
.await?;
|
||||
let img = Bytes::from(read_guard.to_vec());
|
||||
Some((lsn, img))
|
||||
}
|
||||
|
||||
async fn get_ready_ancestor_timeline(
|
||||
&self,
|
||||
ancestor: &Arc<Timeline>,
|
||||
@@ -5228,6 +5280,8 @@ impl Timeline {
|
||||
trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
|
||||
};
|
||||
|
||||
let last_rec_lsn = data.records.last().unwrap().0;
|
||||
|
||||
let img = match self
|
||||
.walredo_mgr
|
||||
.as_ref()
|
||||
@@ -5241,6 +5295,23 @@ impl Timeline {
|
||||
Err(e) => return Err(PageReconstructError::WalRedo(e)),
|
||||
};
|
||||
|
||||
if img.len() == page_cache::PAGE_SZ {
|
||||
let cache = page_cache::get();
|
||||
if let Err(e) = cache
|
||||
.memorize_materialized_page(
|
||||
self.tenant_shard_id,
|
||||
self.timeline_id,
|
||||
key,
|
||||
last_rec_lsn,
|
||||
&img,
|
||||
)
|
||||
.await
|
||||
.context("Materialized page memoization failed")
|
||||
{
|
||||
return Err(PageReconstructError::from(e));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(img)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,16 +12,15 @@ use std::ops::Deref;
|
||||
use std::path::Path;
|
||||
use std::time::Instant;
|
||||
|
||||
use crate::control_file_upgrade::downgrade_v9_to_v8;
|
||||
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
|
||||
use crate::state::{EvictionState, TimelinePersistentState};
|
||||
use crate::state::TimelinePersistentState;
|
||||
use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
|
||||
use utils::{bin_ser::LeSer, id::TenantTimelineId};
|
||||
|
||||
use crate::SafeKeeperConf;
|
||||
|
||||
pub const SK_MAGIC: u32 = 0xcafeceefu32;
|
||||
pub const SK_FORMAT_VERSION: u32 = 9;
|
||||
pub const SK_FORMAT_VERSION: u32 = 8;
|
||||
|
||||
// contains persistent metadata for safekeeper
|
||||
pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
|
||||
@@ -179,18 +178,8 @@ impl Storage for FileStorage {
|
||||
})?;
|
||||
let mut buf: Vec<u8> = Vec::new();
|
||||
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
|
||||
|
||||
if s.eviction_state == EvictionState::Present {
|
||||
// temp hack for forward compatibility
|
||||
const PREV_FORMAT_VERSION: u32 = 8;
|
||||
let prev = downgrade_v9_to_v8(s);
|
||||
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
|
||||
prev.ser_into(&mut buf)?;
|
||||
} else {
|
||||
// otherwise, we write the current format version
|
||||
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
|
||||
s.ser_into(&mut buf)?;
|
||||
}
|
||||
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
|
||||
s.ser_into(&mut buf)?;
|
||||
|
||||
// calculate checksum before resize
|
||||
let checksum = crc32c::crc32c(&buf);
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
//! Code to deal with safekeeper control file upgrades
|
||||
use crate::{
|
||||
safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
|
||||
state::{EvictionState, PersistedPeers, TimelinePersistentState},
|
||||
state::{PersistedPeers, TimelinePersistentState},
|
||||
wal_backup_partial,
|
||||
};
|
||||
use anyhow::{bail, Result};
|
||||
@@ -183,55 +183,6 @@ pub struct SafeKeeperStateV7 {
|
||||
pub peers: PersistedPeers,
|
||||
}
|
||||
|
||||
/// Persistent information stored on safekeeper node about timeline.
|
||||
/// On disk data is prefixed by magic and format version and followed by checksum.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct SafeKeeperStateV8 {
|
||||
#[serde(with = "hex")]
|
||||
pub tenant_id: TenantId,
|
||||
#[serde(with = "hex")]
|
||||
pub timeline_id: TimelineId,
|
||||
/// persistent acceptor state
|
||||
pub acceptor_state: AcceptorState,
|
||||
/// information about server
|
||||
pub server: ServerInfo,
|
||||
/// Unique id of the last *elected* proposer we dealt with. Not needed
|
||||
/// for correctness, exists for monitoring purposes.
|
||||
#[serde(with = "hex")]
|
||||
pub proposer_uuid: PgUuid,
|
||||
/// Since which LSN this timeline generally starts. Safekeeper might have
|
||||
/// joined later.
|
||||
pub timeline_start_lsn: Lsn,
|
||||
/// Since which LSN safekeeper has (had) WAL for this timeline.
|
||||
/// All WAL segments next to one containing local_start_lsn are
|
||||
/// filled with data from the beginning.
|
||||
pub local_start_lsn: Lsn,
|
||||
/// Part of WAL acknowledged by quorum *and available locally*. Always points
|
||||
/// to record boundary.
|
||||
pub commit_lsn: Lsn,
|
||||
/// LSN that points to the end of the last backed up segment. Useful to
|
||||
/// persist to avoid finding out offloading progress on boot.
|
||||
pub backup_lsn: Lsn,
|
||||
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
|
||||
/// of last record streamed to everyone). Persisting it helps skipping
|
||||
/// recovery in walproposer, generally we compute it from peers. In
|
||||
/// walproposer proto called 'truncate_lsn'. Updates are currently drived
|
||||
/// only by walproposer.
|
||||
pub peer_horizon_lsn: Lsn,
|
||||
/// LSN of the oldest known checkpoint made by pageserver and successfully
|
||||
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
|
||||
/// informational purposes, we receive it from pageserver (or broker).
|
||||
pub remote_consistent_lsn: Lsn,
|
||||
/// Peers and their state as we remember it. Knowing peers themselves is
|
||||
/// fundamental; but state is saved here only for informational purposes and
|
||||
/// obviously can be stale. (Currently not saved at all, but let's provision
|
||||
/// place to have less file version upgrades).
|
||||
pub peers: PersistedPeers,
|
||||
/// Holds names of partial segments uploaded to remote storage. Used to
|
||||
/// clean up old objects without leaving garbage in remote storage.
|
||||
pub partial_backup: wal_backup_partial::State,
|
||||
}
|
||||
|
||||
pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
|
||||
// migrate to storing full term history
|
||||
if version == 1 {
|
||||
@@ -262,7 +213,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
});
|
||||
// migrate to hexing some ids
|
||||
} else if version == 2 {
|
||||
@@ -287,7 +237,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
});
|
||||
// migrate to moving tenant_id/timeline_id to the top and adding some lsns
|
||||
} else if version == 3 {
|
||||
@@ -312,7 +261,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
});
|
||||
// migrate to having timeline_start_lsn
|
||||
} else if version == 4 {
|
||||
@@ -337,7 +285,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
remote_consistent_lsn: Lsn(0),
|
||||
peers: PersistedPeers(vec![]),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
});
|
||||
} else if version == 5 {
|
||||
info!("reading safekeeper control file version {}", version);
|
||||
@@ -382,26 +329,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||
peers: oldstate.peers,
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
});
|
||||
} else if version == 8 {
|
||||
let oldstate = SafeKeeperStateV8::des(&buf[..buf.len()])?;
|
||||
|
||||
return Ok(TimelinePersistentState {
|
||||
tenant_id: oldstate.tenant_id,
|
||||
timeline_id: oldstate.timeline_id,
|
||||
acceptor_state: oldstate.acceptor_state,
|
||||
server: oldstate.server,
|
||||
proposer_uuid: oldstate.proposer_uuid,
|
||||
timeline_start_lsn: oldstate.timeline_start_lsn,
|
||||
local_start_lsn: oldstate.local_start_lsn,
|
||||
commit_lsn: oldstate.commit_lsn,
|
||||
backup_lsn: oldstate.backup_lsn,
|
||||
peer_horizon_lsn: oldstate.peer_horizon_lsn,
|
||||
remote_consistent_lsn: oldstate.remote_consistent_lsn,
|
||||
peers: oldstate.peers,
|
||||
partial_backup: oldstate.partial_backup,
|
||||
eviction_state: EvictionState::Present,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -411,25 +338,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
|
||||
bail!("unsupported safekeeper control file version {}", version)
|
||||
}
|
||||
|
||||
pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 {
|
||||
assert!(state.eviction_state == EvictionState::Present);
|
||||
SafeKeeperStateV8 {
|
||||
tenant_id: state.tenant_id,
|
||||
timeline_id: state.timeline_id,
|
||||
acceptor_state: state.acceptor_state.clone(),
|
||||
server: state.server.clone(),
|
||||
proposer_uuid: state.proposer_uuid,
|
||||
timeline_start_lsn: state.timeline_start_lsn,
|
||||
local_start_lsn: state.local_start_lsn,
|
||||
commit_lsn: state.commit_lsn,
|
||||
backup_lsn: state.backup_lsn,
|
||||
peer_horizon_lsn: state.peer_horizon_lsn,
|
||||
remote_consistent_lsn: state.remote_consistent_lsn,
|
||||
peers: state.peers.clone(),
|
||||
partial_backup: state.partial_backup.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::str::FromStr;
|
||||
|
||||
@@ -958,7 +958,7 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::{
|
||||
state::{EvictionState, PersistedPeers, TimelinePersistentState},
|
||||
state::{PersistedPeers, TimelinePersistentState},
|
||||
wal_storage::Storage,
|
||||
};
|
||||
use std::{ops::Deref, str::FromStr, time::Instant};
|
||||
@@ -1225,7 +1225,6 @@ mod tests {
|
||||
},
|
||||
)]),
|
||||
partial_backup: crate::wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
};
|
||||
|
||||
let ser = state.ser().unwrap();
|
||||
@@ -1273,8 +1272,6 @@ mod tests {
|
||||
0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
|
||||
// partial_backup
|
||||
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
// eviction_state
|
||||
0x00, 0x00, 0x00, 0x00,
|
||||
];
|
||||
|
||||
assert_eq!(Hex(&ser), Hex(&expected));
|
||||
|
||||
@@ -63,26 +63,11 @@ pub struct TimelinePersistentState {
|
||||
/// Holds names of partial segments uploaded to remote storage. Used to
|
||||
/// clean up old objects without leaving garbage in remote storage.
|
||||
pub partial_backup: wal_backup_partial::State,
|
||||
/// Eviction state of the timeline. If it's Offloaded, we should download
|
||||
/// WAL files from remote storage to serve the timeline.
|
||||
pub eviction_state: EvictionState,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
|
||||
pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);
|
||||
|
||||
/// State of the local WAL files. Used to track current timeline state,
|
||||
/// that can be either WAL files are present on disk or last partial segment
|
||||
/// is offloaded to remote storage.
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
|
||||
pub enum EvictionState {
|
||||
/// WAL files are present on disk.
|
||||
Present,
|
||||
/// Last partial segment is offloaded to remote storage.
|
||||
/// Contains flush_lsn of the last offloaded segment.
|
||||
Offloaded(Lsn),
|
||||
}
|
||||
|
||||
impl TimelinePersistentState {
|
||||
pub fn new(
|
||||
ttid: &TenantTimelineId,
|
||||
@@ -113,7 +98,6 @@ impl TimelinePersistentState {
|
||||
.collect(),
|
||||
),
|
||||
partial_backup: wal_backup_partial::State::default(),
|
||||
eviction_state: EvictionState::Present,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,15 +8,14 @@ use crate::service::RECONCILE_TIMEOUT;
|
||||
|
||||
const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT;
|
||||
|
||||
/// A wrapper around `OwnedRwLockWriteGuard` used for tracking the
|
||||
/// operation that holds the lock, and print a warning if it exceeds
|
||||
/// the LOCK_TIMEOUT_ALERT_THRESHOLD time
|
||||
pub struct TracingExclusiveGuard<T: Display> {
|
||||
/// A wrapper around `OwnedRwLockWriteGuard` that when dropped changes the
|
||||
/// current holding operation in lock.
|
||||
pub struct WrappedWriteGuard<T: Display> {
|
||||
guard: tokio::sync::OwnedRwLockWriteGuard<Option<T>>,
|
||||
start: Instant,
|
||||
}
|
||||
|
||||
impl<T: Display> TracingExclusiveGuard<T> {
|
||||
impl<T: Display> WrappedWriteGuard<T> {
|
||||
pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard<Option<T>>) -> Self {
|
||||
Self {
|
||||
guard,
|
||||
@@ -25,12 +24,12 @@ impl<T: Display> TracingExclusiveGuard<T> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Display> Drop for TracingExclusiveGuard<T> {
|
||||
impl<T: Display> Drop for WrappedWriteGuard<T> {
|
||||
fn drop(&mut self) {
|
||||
let duration = self.start.elapsed();
|
||||
if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
|
||||
tracing::warn!(
|
||||
"Exclusive lock by {} was held for {:?}",
|
||||
"Lock on {} was held for {:?}",
|
||||
self.guard.as_ref().unwrap(),
|
||||
duration
|
||||
);
|
||||
@@ -39,38 +38,6 @@ impl<T: Display> Drop for TracingExclusiveGuard<T> {
|
||||
}
|
||||
}
|
||||
|
||||
// A wrapper around `OwnedRwLockReadGuard` used for tracking the
|
||||
/// operation that holds the lock, and print a warning if it exceeds
|
||||
/// the LOCK_TIMEOUT_ALERT_THRESHOLD time
|
||||
pub struct TracingSharedGuard<T: Display> {
|
||||
_guard: tokio::sync::OwnedRwLockReadGuard<Option<T>>,
|
||||
operation: T,
|
||||
start: Instant,
|
||||
}
|
||||
|
||||
impl<T: Display> TracingSharedGuard<T> {
|
||||
pub fn new(guard: tokio::sync::OwnedRwLockReadGuard<Option<T>>, operation: T) -> Self {
|
||||
Self {
|
||||
_guard: guard,
|
||||
operation,
|
||||
start: Instant::now(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: Display> Drop for TracingSharedGuard<T> {
|
||||
fn drop(&mut self) {
|
||||
let duration = self.start.elapsed();
|
||||
if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
|
||||
tracing::warn!(
|
||||
"Shared lock by {} was held for {:?}",
|
||||
self.operation,
|
||||
duration
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
|
||||
/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
|
||||
/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
|
||||
@@ -91,22 +58,21 @@ where
|
||||
pub(crate) fn shared(
|
||||
&self,
|
||||
key: T,
|
||||
operation: I,
|
||||
) -> impl std::future::Future<Output = TracingSharedGuard<I>> {
|
||||
) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<Option<I>>> {
|
||||
let mut locked = self.entities.lock().unwrap();
|
||||
let entry = locked.entry(key).or_default().clone();
|
||||
async move { TracingSharedGuard::new(entry.read_owned().await, operation) }
|
||||
let entry = locked.entry(key).or_default();
|
||||
entry.clone().read_owned()
|
||||
}
|
||||
|
||||
pub(crate) fn exclusive(
|
||||
&self,
|
||||
key: T,
|
||||
operation: I,
|
||||
) -> impl std::future::Future<Output = TracingExclusiveGuard<I>> {
|
||||
) -> impl std::future::Future<Output = WrappedWriteGuard<I>> {
|
||||
let mut locked = self.entities.lock().unwrap();
|
||||
let entry = locked.entry(key).or_default().clone();
|
||||
async move {
|
||||
let mut guard = TracingExclusiveGuard::new(entry.write_owned().await);
|
||||
let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await);
|
||||
*guard.guard = Some(operation);
|
||||
guard
|
||||
}
|
||||
@@ -133,12 +99,12 @@ where
|
||||
|
||||
pub async fn trace_exclusive_lock<
|
||||
T: Clone + Display + Eq + PartialEq + std::hash::Hash,
|
||||
I: Clone + Display,
|
||||
I: Display + Clone,
|
||||
>(
|
||||
op_locks: &IdLockMap<T, I>,
|
||||
key: T,
|
||||
operation: I,
|
||||
) -> TracingExclusiveGuard<I> {
|
||||
) -> WrappedWriteGuard<I> {
|
||||
let start = Instant::now();
|
||||
let guard = op_locks.exclusive(key.clone(), operation.clone()).await;
|
||||
|
||||
@@ -157,14 +123,14 @@ pub async fn trace_exclusive_lock<
|
||||
|
||||
pub async fn trace_shared_lock<
|
||||
T: Clone + Display + Eq + PartialEq + std::hash::Hash,
|
||||
I: Clone + Display,
|
||||
I: Display,
|
||||
>(
|
||||
op_locks: &IdLockMap<T, I>,
|
||||
key: T,
|
||||
operation: I,
|
||||
) -> TracingSharedGuard<I> {
|
||||
) -> tokio::sync::OwnedRwLockReadGuard<Option<I>> {
|
||||
let start = Instant::now();
|
||||
let guard = op_locks.shared(key.clone(), operation.clone()).await;
|
||||
let guard = op_locks.shared(key.clone()).await;
|
||||
|
||||
let duration = start.elapsed();
|
||||
if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
|
||||
@@ -193,11 +159,11 @@ mod tests {
|
||||
async fn multiple_shared_locks() {
|
||||
let id_lock_map: IdLockMap<i32, Operations> = IdLockMap::default();
|
||||
|
||||
let shared_lock_1 = id_lock_map.shared(1, Operations::Op1).await;
|
||||
let shared_lock_2 = id_lock_map.shared(1, Operations::Op2).await;
|
||||
let shared_lock_1 = id_lock_map.shared(1).await;
|
||||
let shared_lock_2 = id_lock_map.shared(1).await;
|
||||
|
||||
assert_eq!(shared_lock_1.operation, Operations::Op1);
|
||||
assert_eq!(shared_lock_2.operation, Operations::Op2);
|
||||
assert!(shared_lock_1.is_none());
|
||||
assert!(shared_lock_2.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -217,7 +183,7 @@ mod tests {
|
||||
assert!(_ex_lock_2.is_err());
|
||||
}
|
||||
|
||||
let shared_lock_1 = id_lock_map.shared(resource_id, Operations::Op1).await;
|
||||
assert_eq!(shared_lock_1.operation, Operations::Op1);
|
||||
let shared_lock_1 = id_lock_map.shared(resource_id).await;
|
||||
assert!(shared_lock_1.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -391,7 +391,7 @@ impl Scheduler {
|
||||
return Err(ScheduleError::NoPageservers);
|
||||
}
|
||||
|
||||
let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self
|
||||
let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
|
||||
.nodes
|
||||
.iter()
|
||||
.filter_map(|(k, v)| {
|
||||
@@ -402,7 +402,6 @@ impl Scheduler {
|
||||
*k,
|
||||
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
|
||||
v.shard_count,
|
||||
v.attached_shard_count,
|
||||
))
|
||||
}
|
||||
})
|
||||
@@ -410,12 +409,9 @@ impl Scheduler {
|
||||
|
||||
// Sort by, in order of precedence:
|
||||
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
|
||||
// 2nd: Attached shard count. Within nodes with the same affinity, we always pick the node with
|
||||
// the least number of attached shards.
|
||||
// 3rd: Total shard count. Within nodes with the same affinity and attached shard count, use nodes
|
||||
// with the lower total shard count.
|
||||
// 4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
|
||||
scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));
|
||||
// 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
|
||||
// 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
|
||||
scores.sort_by_key(|i| (i.1, i.2, i.0));
|
||||
|
||||
if scores.is_empty() {
|
||||
// After applying constraints, no pageservers were left.
|
||||
|
||||
@@ -13,7 +13,7 @@ use crate::{
|
||||
Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION,
|
||||
},
|
||||
compute_hook::NotifyError,
|
||||
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
|
||||
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard},
|
||||
persistence::{AbortShardSplitStatus, TenantFilter},
|
||||
reconciler::{ReconcileError, ReconcileUnits},
|
||||
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
|
||||
@@ -359,7 +359,7 @@ struct TenantShardSplitAbort {
|
||||
new_shard_count: ShardCount,
|
||||
new_stripe_size: Option<ShardStripeSize>,
|
||||
/// Until this abort op is complete, no other operations may be done on the tenant
|
||||
_tenant_lock: TracingExclusiveGuard<TenantOperations>,
|
||||
_tenant_lock: WrappedWriteGuard<TenantOperations>,
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
@@ -1429,7 +1429,7 @@ impl Service {
|
||||
async fn node_activate_reconcile(
|
||||
&self,
|
||||
mut node: Node,
|
||||
_lock: &TracingExclusiveGuard<NodeOperations>,
|
||||
_lock: &WrappedWriteGuard<NodeOperations>,
|
||||
) -> Result<(), ApiError> {
|
||||
// This Node is a mutable local copy: we will set it active so that we can use its
|
||||
// API client to reconcile with the node. The Node in [`Self::nodes`] will get updated
|
||||
@@ -2658,7 +2658,6 @@ impl Service {
|
||||
TenantOperations::TimelineCreate,
|
||||
)
|
||||
.await;
|
||||
failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock");
|
||||
|
||||
self.ensure_attached_wait(tenant_id).await?;
|
||||
|
||||
@@ -5323,6 +5322,14 @@ impl Service {
|
||||
}
|
||||
};
|
||||
|
||||
// Reset the scheduling context if we have moved over to a new tenant.
|
||||
// This is required since the affinity scores stored in the scheduling
|
||||
// context should be tenant specific. Note that we are relying on
|
||||
// [`ServiceState::tenants`] being ordered by tenant id.
|
||||
if last_inspected_shard.map(|tid| tid.tenant_id) != Some(tid.tenant_id) {
|
||||
schedule_context = ScheduleContext::default();
|
||||
}
|
||||
|
||||
if tenant_shard.intent.demote_attached(scheduler, node_id) {
|
||||
match tenant_shard.schedule(scheduler, &mut schedule_context) {
|
||||
Err(e) => {
|
||||
@@ -5395,9 +5402,6 @@ impl Service {
|
||||
/// throughout the cluster. We achieve this by picking tenant shards from each node,
|
||||
/// starting from the ones with the largest number of attached shards, until the node
|
||||
/// reaches the expected cluster average.
|
||||
/// 3. Avoid promoting more shards of the same tenant than required. The upper bound
|
||||
/// for the number of tenants from the same shard promoted to the node being filled is:
|
||||
/// shard count for the tenant divided by the number of nodes in the cluster.
|
||||
fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
|
||||
let mut locked = self.inner.write().unwrap();
|
||||
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
|
||||
@@ -5419,18 +5423,8 @@ impl Service {
|
||||
let expected_attached = locked.scheduler.expected_attached_shard_count();
|
||||
let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
|
||||
|
||||
let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
|
||||
let mut plan = Vec::new();
|
||||
|
||||
for (node_id, attached) in nodes_by_load {
|
||||
let available = locked
|
||||
.nodes
|
||||
.get(&node_id)
|
||||
.map_or(false, |n| n.is_available());
|
||||
if !available {
|
||||
continue;
|
||||
}
|
||||
|
||||
if plan.len() >= fill_requirement
|
||||
|| tids_by_node.is_empty()
|
||||
|| attached <= expected_attached
|
||||
@@ -5438,22 +5432,13 @@ impl Service {
|
||||
break;
|
||||
}
|
||||
|
||||
let mut can_take = attached - expected_attached;
|
||||
let can_take = attached - expected_attached;
|
||||
let mut remove_node = false;
|
||||
while can_take > 0 {
|
||||
for _ in 0..can_take {
|
||||
match tids_by_node.get_mut(&node_id) {
|
||||
Some(tids) => match tids.pop() {
|
||||
Some(tid) => {
|
||||
let max_promote_for_tenant = std::cmp::max(
|
||||
tid.shard_count.count() as usize / locked.nodes.len(),
|
||||
1,
|
||||
);
|
||||
let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default();
|
||||
if *promoted < max_promote_for_tenant {
|
||||
plan.push(tid);
|
||||
*promoted += 1;
|
||||
can_take -= 1;
|
||||
}
|
||||
plan.push(tid);
|
||||
}
|
||||
None => {
|
||||
remove_node = true;
|
||||
@@ -5515,8 +5500,17 @@ impl Service {
|
||||
));
|
||||
}
|
||||
|
||||
let mut last_inspected_tenant = None;
|
||||
while waiters.len() < MAX_RECONCILES_PER_OPERATION {
|
||||
if let Some(tid) = tids_to_promote.pop() {
|
||||
// Reset the scheduling context if we have moved over to a new tenant.
|
||||
// This is required since the affinity scores stored in the scheduling
|
||||
// context should be tenant specific. Note that we are relying on the
|
||||
// result [`Service::fill_node_plan`] being ordered by tenant id.
|
||||
if last_inspected_tenant != Some(tid.tenant_id) {
|
||||
schedule_context = ScheduleContext::default();
|
||||
}
|
||||
|
||||
if let Some(tenant_shard) = tenants.get_mut(&tid) {
|
||||
// If the node being filled is not a secondary anymore,
|
||||
// skip the promotion.
|
||||
@@ -5551,6 +5545,8 @@ impl Service {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
last_inspected_tenant = Some(tid.tenant_id);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1632,10 +1632,14 @@ pub(crate) mod tests {
|
||||
|
||||
// We should see equal number of locations on the two nodes.
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2);
|
||||
// Scheduling does not consider the number of attachments picking the initial
|
||||
// pageserver to attach to (hence the assertion that all primaries are on the
|
||||
// same node)
|
||||
// TODO: Tweak the scheduling to evenly distribute attachments for new shards.
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4);
|
||||
|
||||
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2);
|
||||
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
|
||||
|
||||
// Add another two nodes: we should see the shards spread out when their optimize
|
||||
// methods are called
|
||||
|
||||
@@ -118,6 +118,8 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
|
||||
"libmetrics_launch_timestamp",
|
||||
"libmetrics_build_info",
|
||||
"libmetrics_tracing_event_count_total",
|
||||
"pageserver_materialized_cache_hits_total",
|
||||
"pageserver_materialized_cache_hits_direct_total",
|
||||
"pageserver_page_cache_read_hits_total",
|
||||
"pageserver_page_cache_read_accesses_total",
|
||||
"pageserver_page_cache_size_current_bytes",
|
||||
|
||||
@@ -1177,10 +1177,10 @@ class NeonEnv:
|
||||
force=config.config_init_force,
|
||||
)
|
||||
|
||||
def start(self, timeout_in_seconds: Optional[int] = None):
|
||||
def start(self):
|
||||
# Storage controller starts first, so that pageserver /re-attach calls don't
|
||||
# bounce through retries on startup
|
||||
self.storage_controller.start(timeout_in_seconds=timeout_in_seconds)
|
||||
self.storage_controller.start()
|
||||
|
||||
# Wait for storage controller readiness to prevent unnecessary post start-up
|
||||
# reconcile.
|
||||
@@ -1196,18 +1196,10 @@ class NeonEnv:
|
||||
) # The `or None` is for the linter
|
||||
|
||||
for pageserver in self.pageservers:
|
||||
futs.append(
|
||||
executor.submit(
|
||||
lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
|
||||
)
|
||||
)
|
||||
futs.append(executor.submit(lambda ps=pageserver: ps.start()))
|
||||
|
||||
for safekeeper in self.safekeepers:
|
||||
futs.append(
|
||||
executor.submit(
|
||||
lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
|
||||
)
|
||||
)
|
||||
futs.append(executor.submit(lambda sk=safekeeper: sk.start()))
|
||||
|
||||
for f in futs:
|
||||
f.result()
|
||||
@@ -1791,13 +1783,8 @@ class NeonCli(AbstractNeonCli):
|
||||
res.check_returncode()
|
||||
return res
|
||||
|
||||
def storage_controller_start(
|
||||
self,
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
):
|
||||
def storage_controller_start(self):
|
||||
cmd = ["storage_controller", "start"]
|
||||
if timeout_in_seconds is not None:
|
||||
cmd.append(f"--start-timeout={timeout_in_seconds}s")
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def storage_controller_stop(self, immediate: bool):
|
||||
@@ -1810,11 +1797,8 @@ class NeonCli(AbstractNeonCli):
|
||||
self,
|
||||
id: int,
|
||||
extra_env_vars: Optional[Dict[str, str]] = None,
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
start_args = ["pageserver", "start", f"--id={id}"]
|
||||
if timeout_in_seconds is not None:
|
||||
start_args.append(f"--start-timeout={timeout_in_seconds}s")
|
||||
storage = self.env.pageserver_remote_storage
|
||||
|
||||
if isinstance(storage, S3Storage):
|
||||
@@ -1832,10 +1816,7 @@ class NeonCli(AbstractNeonCli):
|
||||
return self.raw_cli(cmd)
|
||||
|
||||
def safekeeper_start(
|
||||
self,
|
||||
id: int,
|
||||
extra_opts: Optional[List[str]] = None,
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
self, id: int, extra_opts: Optional[List[str]] = None
|
||||
) -> "subprocess.CompletedProcess[str]":
|
||||
s3_env_vars = None
|
||||
if isinstance(self.env.safekeepers_remote_storage, S3Storage):
|
||||
@@ -1845,8 +1826,6 @@ class NeonCli(AbstractNeonCli):
|
||||
extra_opts = [f"-e={opt}" for opt in extra_opts]
|
||||
else:
|
||||
extra_opts = []
|
||||
if timeout_in_seconds is not None:
|
||||
extra_opts.append(f"--start-timeout={timeout_in_seconds}s")
|
||||
return self.raw_cli(
|
||||
["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars
|
||||
)
|
||||
@@ -2098,9 +2077,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
|
||||
self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
|
||||
self.logfile = self.workdir / "storage_controller.log"
|
||||
|
||||
def start(self, timeout_in_seconds: Optional[int] = None):
|
||||
def start(self):
|
||||
assert not self.running
|
||||
self.env.neon_cli.storage_controller_start(timeout_in_seconds)
|
||||
self.env.neon_cli.storage_controller_start()
|
||||
self.running = True
|
||||
return self
|
||||
|
||||
@@ -2552,7 +2531,6 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
def start(
|
||||
self,
|
||||
extra_env_vars: Optional[Dict[str, str]] = None,
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
) -> "NeonPageserver":
|
||||
"""
|
||||
Start the page server.
|
||||
@@ -2561,9 +2539,7 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
"""
|
||||
assert self.running is False
|
||||
|
||||
self.env.neon_cli.pageserver_start(
|
||||
self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds
|
||||
)
|
||||
self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars)
|
||||
self.running = True
|
||||
return self
|
||||
|
||||
@@ -2577,17 +2553,13 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
self.running = False
|
||||
return self
|
||||
|
||||
def restart(
|
||||
self,
|
||||
immediate: bool = False,
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
):
|
||||
def restart(self, immediate: bool = False):
|
||||
"""
|
||||
High level wrapper for restart: restarts the process, and waits for
|
||||
tenant state to stabilize.
|
||||
"""
|
||||
self.stop(immediate=immediate)
|
||||
self.start(timeout_in_seconds=timeout_in_seconds)
|
||||
self.start()
|
||||
self.quiesce_tenants()
|
||||
|
||||
def quiesce_tenants(self):
|
||||
@@ -2728,6 +2700,12 @@ class NeonPageserver(PgProtocol, LogUtils):
|
||||
client = self.http_client(auth_token=auth_token)
|
||||
return client.tenant_create(tenant_id, conf, generation=generation)
|
||||
|
||||
def tenant_load(self, tenant_id: TenantId):
|
||||
client = self.http_client()
|
||||
return client.tenant_load(
|
||||
tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
|
||||
)
|
||||
|
||||
def list_layers(
|
||||
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
|
||||
) -> list[Path]:
|
||||
@@ -3468,12 +3446,11 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
|
||||
# path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf
|
||||
|
||||
# Semaphore is set to 1 when we start, and acquire'd back to zero when we stop
|
||||
#
|
||||
# We use a semaphore rather than a bool so that racing calls to stop() don't
|
||||
# try and stop the same process twice, as stop() is called by test teardown and
|
||||
# potentially by some __del__ chains in other threads.
|
||||
self._running = threading.Semaphore(0)
|
||||
# This lock prevents concurrent start & stop operations, keeping `self.running` consistent
|
||||
# with whether we're really running. Tests generally wouldn't try and do these concurrently,
|
||||
# but endpoints are also stopped during test teardown, which might happen concurrently with
|
||||
# destruction of objects in tests.
|
||||
self.lock = threading.Lock()
|
||||
|
||||
def http_client(
|
||||
self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
|
||||
@@ -3545,14 +3522,15 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
|
||||
log.info(f"Starting postgres endpoint {self.endpoint_id}")
|
||||
|
||||
self.env.neon_cli.endpoint_start(
|
||||
self.endpoint_id,
|
||||
safekeepers=self.active_safekeepers,
|
||||
remote_ext_config=remote_ext_config,
|
||||
pageserver_id=pageserver_id,
|
||||
allow_multiple=allow_multiple,
|
||||
)
|
||||
self._running.release(1)
|
||||
with self.lock:
|
||||
self.env.neon_cli.endpoint_start(
|
||||
self.endpoint_id,
|
||||
safekeepers=self.active_safekeepers,
|
||||
remote_ext_config=remote_ext_config,
|
||||
pageserver_id=pageserver_id,
|
||||
allow_multiple=allow_multiple,
|
||||
)
|
||||
self.running = True
|
||||
|
||||
return self
|
||||
|
||||
@@ -3600,12 +3578,9 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
conf_file.write("\n".join(hba) + "\n")
|
||||
conf_file.write(data)
|
||||
|
||||
if self.is_running():
|
||||
if self.running:
|
||||
self.safe_psql("SELECT pg_reload_conf()")
|
||||
|
||||
def is_running(self):
|
||||
return self._running._value > 0
|
||||
|
||||
def reconfigure(self, pageserver_id: Optional[int] = None):
|
||||
assert self.endpoint_id is not None
|
||||
self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id)
|
||||
@@ -3654,12 +3629,13 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
running = self._running.acquire(blocking=False)
|
||||
if running:
|
||||
assert self.endpoint_id is not None
|
||||
self.env.neon_cli.endpoint_stop(
|
||||
self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
|
||||
)
|
||||
with self.lock:
|
||||
if self.running:
|
||||
assert self.endpoint_id is not None
|
||||
self.env.neon_cli.endpoint_stop(
|
||||
self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
|
||||
)
|
||||
self.running = False
|
||||
|
||||
return self
|
||||
|
||||
@@ -3669,13 +3645,13 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
running = self._running.acquire(blocking=False)
|
||||
if running:
|
||||
with self.lock:
|
||||
assert self.endpoint_id is not None
|
||||
self.env.neon_cli.endpoint_stop(
|
||||
self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
|
||||
)
|
||||
self.endpoint_id = None
|
||||
self.running = False
|
||||
|
||||
return self
|
||||
|
||||
@@ -3863,13 +3839,9 @@ class Safekeeper(LogUtils):
|
||||
self.running = running
|
||||
self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"
|
||||
|
||||
def start(
|
||||
self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None
|
||||
) -> "Safekeeper":
|
||||
def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper":
|
||||
assert self.running is False
|
||||
self.env.neon_cli.safekeeper_start(
|
||||
self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds
|
||||
)
|
||||
self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
|
||||
self.running = True
|
||||
# wait for wal acceptor start by checking its status
|
||||
started_at = time.time()
|
||||
|
||||
@@ -340,6 +340,17 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
|
||||
self.verbose_error(res)
|
||||
return res
|
||||
|
||||
def tenant_load(self, tenant_id: TenantId, generation=None):
|
||||
body = None
|
||||
if generation is not None:
|
||||
body = {"generation": generation}
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load", json=body)
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_ignore(self, tenant_id: TenantId):
|
||||
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
|
||||
self.verbose_error(res)
|
||||
|
||||
def tenant_status(
|
||||
self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False
|
||||
) -> Dict[Any, Any]:
|
||||
|
||||
@@ -430,6 +430,52 @@ def enable_remote_storage_versioning(
|
||||
return response
|
||||
|
||||
|
||||
def wait_tenant_status_404(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
iterations: int,
|
||||
interval: float = 0.250,
|
||||
):
|
||||
def tenant_is_missing():
|
||||
data = {}
|
||||
try:
|
||||
data = pageserver_http.tenant_status(tenant_id)
|
||||
log.info(f"tenant status {data}")
|
||||
except PageserverApiException as e:
|
||||
log.debug(e)
|
||||
if e.status_code == 404:
|
||||
return
|
||||
|
||||
raise RuntimeError(f"Timeline exists state {data.get('state')}")
|
||||
|
||||
wait_until(iterations, interval=interval, func=tenant_is_missing)
|
||||
|
||||
|
||||
def tenant_delete_wait_completed(
|
||||
pageserver_http: PageserverHttpClient,
|
||||
tenant_id: TenantId,
|
||||
iterations: int,
|
||||
ignore_errors: bool = False,
|
||||
):
|
||||
if not ignore_errors:
|
||||
pageserver_http.tenant_delete(tenant_id=tenant_id)
|
||||
else:
|
||||
interval = 0.5
|
||||
|
||||
def delete_request_sent():
|
||||
try:
|
||||
pageserver_http.tenant_delete(tenant_id=tenant_id)
|
||||
except PageserverApiException as e:
|
||||
log.debug(e)
|
||||
if e.status_code == 404:
|
||||
return
|
||||
except Exception as e:
|
||||
log.debug(e)
|
||||
|
||||
wait_until(iterations, interval=interval, func=delete_request_sent)
|
||||
wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
|
||||
|
||||
|
||||
MANY_SMALL_LAYERS_TENANT_CONFIG = {
|
||||
"gc_period": "0s",
|
||||
"compaction_period": "0s",
|
||||
|
||||
@@ -85,8 +85,6 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
|
||||
f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}",
|
||||
n_tenants,
|
||||
setup_wrapper,
|
||||
# https://github.com/neondatabase/neon/issues/8070
|
||||
timeout_in_seconds=60,
|
||||
)
|
||||
|
||||
env.pageserver.allowed_errors.append(
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
Utilities used by all code in this sub-directory
|
||||
"""
|
||||
|
||||
from typing import Any, Callable, Dict, Optional, Tuple
|
||||
from typing import Any, Callable, Dict, Tuple
|
||||
|
||||
import fixtures.pageserver.many_tenants as many_tenants
|
||||
from fixtures.common_types import TenantId, TimelineId
|
||||
@@ -41,7 +41,6 @@ def setup_pageserver_with_tenants(
|
||||
name: str,
|
||||
n_tenants: int,
|
||||
setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
|
||||
timeout_in_seconds: Optional[int] = None,
|
||||
) -> NeonEnv:
|
||||
"""
|
||||
Utility function to set up a pageserver with a given number of identical tenants.
|
||||
@@ -51,6 +50,6 @@ def setup_pageserver_with_tenants(
|
||||
return many_tenants.single_timeline(neon_env_builder, setup, n_tenants)
|
||||
|
||||
env = neon_env_builder.build_and_use_snapshot(name, doit)
|
||||
env.start(timeout_in_seconds=timeout_in_seconds)
|
||||
env.start()
|
||||
ensure_pageserver_ready_for_benchmarking(env, n_tenants)
|
||||
return env
|
||||
|
||||
@@ -4,6 +4,7 @@ import pytest
|
||||
from fixtures.benchmark_fixture import MetricReport
|
||||
from fixtures.common_types import Lsn
|
||||
from fixtures.compare_fixtures import NeonCompare, PgCompare
|
||||
from fixtures.pageserver.utils import wait_tenant_status_404
|
||||
from fixtures.pg_version import PgVersion
|
||||
|
||||
|
||||
@@ -67,6 +68,7 @@ def measure_recovery_time(env: NeonCompare):
|
||||
(attach_gen, _) = attach_status
|
||||
|
||||
client.tenant_delete(env.tenant)
|
||||
wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5)
|
||||
env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen)
|
||||
|
||||
# Measure recovery time
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import os
|
||||
import queue
|
||||
import random
|
||||
import threading
|
||||
import time
|
||||
@@ -9,7 +8,11 @@ from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder
|
||||
from fixtures.utils import query_scalar
|
||||
|
||||
|
||||
def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
|
||||
def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str):
|
||||
if build_type == "debug":
|
||||
# Disable vectored read path cross validation since it makes the test time out.
|
||||
neon_env_builder.pageserver_config_override = "validate_vectored_get=false"
|
||||
|
||||
env = neon_env_builder.init_start()
|
||||
|
||||
cache_dir = os.path.join(env.repo_dir, "file_cache")
|
||||
@@ -30,10 +33,11 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
cur = endpoint.connect().cursor()
|
||||
|
||||
stop = threading.Event()
|
||||
n_rows = 100000
|
||||
n_threads = 20
|
||||
n_updates_per_thread = 10000
|
||||
n_updates_per_connection = 1000
|
||||
n_total_updates = n_threads * n_updates_per_thread
|
||||
|
||||
cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)")
|
||||
cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g")
|
||||
@@ -44,11 +48,11 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
|
||||
# performed (plus the initial 1 on each row).
|
||||
#
|
||||
# Furthermore, each thread will reconnect between every 1000 updates.
|
||||
def run_updates(n_updates_performed_q: queue.Queue[int]):
|
||||
def run_updates():
|
||||
n_updates_performed = 0
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
while not stop.is_set():
|
||||
for _ in range(n_updates_per_thread):
|
||||
id = random.randint(1, n_rows)
|
||||
cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}")
|
||||
n_updates_performed += 1
|
||||
@@ -57,28 +61,19 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
|
||||
conn.close()
|
||||
conn = endpoint.connect()
|
||||
cur = conn.cursor()
|
||||
n_updates_performed_q.put(n_updates_performed)
|
||||
|
||||
n_updates_performed_q: queue.Queue[int] = queue.Queue()
|
||||
threads: List[threading.Thread] = []
|
||||
for _i in range(n_threads):
|
||||
thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True)
|
||||
thread = threading.Thread(target=run_updates, args=(), daemon=True)
|
||||
thread.start()
|
||||
threads.append(thread)
|
||||
|
||||
time.sleep(5)
|
||||
|
||||
# unlink, this is what we're actually testing
|
||||
new_cache_dir = os.path.join(env.repo_dir, "file_cache_new")
|
||||
os.rename(cache_dir, new_cache_dir)
|
||||
|
||||
time.sleep(10)
|
||||
|
||||
stop.set()
|
||||
|
||||
n_updates_performed = 0
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
n_updates_performed += n_updates_performed_q.get()
|
||||
|
||||
assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed
|
||||
assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_total_updates + n_rows
|
||||
|
||||
@@ -11,6 +11,8 @@ from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubb
|
||||
from fixtures.pageserver.common_types import parse_layer_file_name
|
||||
from fixtures.pageserver.utils import (
|
||||
assert_prefix_empty,
|
||||
poll_for_remote_storage_iterations,
|
||||
tenant_delete_wait_completed,
|
||||
wait_for_upload_queue_empty,
|
||||
)
|
||||
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage
|
||||
@@ -361,7 +363,8 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
|
||||
|
||||
# Check that deletion works properly on a tenant that was live-migrated
|
||||
# (reproduce https://github.com/neondatabase/neon/issues/6802)
|
||||
pageserver_b.http_client().tenant_delete(tenant_id)
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
tenant_delete_wait_completed(pageserver_b.http_client(), tenant_id, iterations)
|
||||
|
||||
|
||||
def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
|
||||
@@ -549,7 +552,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
|
||||
log.info("Deleting tenant...")
|
||||
ps_attached.http_client().tenant_delete(tenant_id)
|
||||
tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)
|
||||
|
||||
assert_prefix_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
|
||||
@@ -23,11 +23,11 @@ if TYPE_CHECKING:
|
||||
|
||||
# Run the main PostgreSQL regression tests, in src/test/regress.
|
||||
#
|
||||
@pytest.mark.timeout(600)
|
||||
@pytest.mark.parametrize("shard_count", [None, 4])
|
||||
def test_pg_regress(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
test_output_dir: Path,
|
||||
build_type: str,
|
||||
pg_bin: PgBin,
|
||||
capsys: CaptureFixture[str],
|
||||
base_dir: Path,
|
||||
@@ -43,6 +43,10 @@ def test_pg_regress(
|
||||
if shard_count is not None:
|
||||
neon_env_builder.num_pageservers = shard_count
|
||||
|
||||
if build_type == "debug":
|
||||
# Disable vectored read path cross validation since it makes the test time out.
|
||||
neon_env_builder.pageserver_config_override = "validate_vectored_get=false"
|
||||
|
||||
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
|
||||
neon_env_builder.enable_scrub_on_exit()
|
||||
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)
|
||||
|
||||
@@ -6,6 +6,7 @@ from fixtures.neon_fixtures import NeonEnv
|
||||
|
||||
def test_physical_replication(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
n_records = 100000
|
||||
with env.endpoints.create_start(
|
||||
branch_name="main",
|
||||
endpoint_id="primary",
|
||||
@@ -21,20 +22,8 @@ def test_physical_replication(neon_simple_env: NeonEnv):
|
||||
with p_con.cursor() as p_cur:
|
||||
with secondary.connect() as s_con:
|
||||
with s_con.cursor() as s_cur:
|
||||
runtime_secs = 30
|
||||
started_at = time.time()
|
||||
pk = 0
|
||||
while True:
|
||||
pk += 1
|
||||
now = time.time()
|
||||
if now - started_at > runtime_secs:
|
||||
break
|
||||
for pk in range(n_records):
|
||||
p_cur.execute("insert into t (pk) values (%s)", (pk,))
|
||||
# an earlier version of this test was based on a fixed number of loop iterations
|
||||
# and selected for pk=(random.randrange(1, fixed number of loop iterations)).
|
||||
# => the probability of selection for a value that was never inserted changed from 99.9999% to 0% over the course of the test.
|
||||
#
|
||||
# We changed the test to where=(random.randrange(1, 2*pk)), which means the probability is now fixed to 50%.
|
||||
s_cur.execute(
|
||||
"select * from t where pk=%s", (random.randrange(1, 2 * pk),)
|
||||
"select * from t where pk=%s", (random.randrange(1, n_records),)
|
||||
)
|
||||
|
||||
@@ -11,6 +11,8 @@ from fixtures.pageserver.utils import (
|
||||
MANY_SMALL_LAYERS_TENANT_CONFIG,
|
||||
assert_prefix_empty,
|
||||
enable_remote_storage_versioning,
|
||||
poll_for_remote_storage_iterations,
|
||||
tenant_delete_wait_completed,
|
||||
wait_for_upload,
|
||||
)
|
||||
from fixtures.remote_storage import RemoteStorageKind, s3_storage
|
||||
@@ -81,7 +83,8 @@ def test_tenant_s3_restore(
|
||||
assert (
|
||||
ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
|
||||
), "tenant removed before we deletion was issued"
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
|
||||
ps_http.deletion_queue_flush(execute=True)
|
||||
assert (
|
||||
ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
|
||||
|
||||
@@ -24,6 +24,7 @@ from fixtures.pageserver.utils import (
|
||||
enable_remote_storage_versioning,
|
||||
list_prefix,
|
||||
remote_storage_delete_key,
|
||||
tenant_delete_wait_completed,
|
||||
timeline_delete_wait_completed,
|
||||
)
|
||||
from fixtures.pg_version import PgVersion
|
||||
@@ -157,7 +158,7 @@ def test_storage_controller_smoke(
|
||||
|
||||
# Delete all the tenants
|
||||
for tid in tenant_ids:
|
||||
env.storage_controller.pageserver_api().tenant_delete(tid)
|
||||
tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10)
|
||||
|
||||
env.storage_controller.consistency_check()
|
||||
|
||||
@@ -1383,8 +1384,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
|
||||
tenant_id = env.initial_tenant
|
||||
env.storage_controller.allowed_errors.extend(
|
||||
[
|
||||
".*Exclusive lock by.*",
|
||||
".*Shared lock by.*",
|
||||
".*Lock on.*",
|
||||
".*Scheduling is disabled by policy.*",
|
||||
f".*Operation TimelineCreate on key {tenant_id} has waited.*",
|
||||
]
|
||||
@@ -1416,23 +1416,9 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
thread_update_tenant_policy.join()
|
||||
|
||||
env.storage_controller.assert_log_contains("Exclusive lock by UpdatePolicy was held for")
|
||||
_, last_log_cursor = env.storage_controller.assert_log_contains(
|
||||
f"Operation TimelineCreate on key {tenant_id} has waited"
|
||||
)
|
||||
|
||||
# Test out shared lock
|
||||
env.storage_controller.configure_failpoints(
|
||||
("tenant-create-timeline-shared-lock", "return(31000)")
|
||||
)
|
||||
|
||||
timeline_id = TimelineId.generate()
|
||||
# This will hold the shared lock for enough time to cause an warning
|
||||
env.storage_controller.pageserver_api().timeline_create(
|
||||
pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id
|
||||
)
|
||||
env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for")
|
||||
env.storage_controller.assert_log_contains(
|
||||
"Shared lock by TimelineCreate was held for", offset=last_log_cursor
|
||||
f"Operation TimelineCreate on key {tenant_id} has waited"
|
||||
)
|
||||
|
||||
|
||||
@@ -1541,7 +1527,13 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
|
||||
)
|
||||
|
||||
# Give things a chance to settle.
|
||||
env.storage_controller.reconcile_until_idle(timeout_secs=30)
|
||||
# A call to `reconcile_until_idle` could be used here instead,
|
||||
# however since all attachments are placed on the same node,
|
||||
# we'd have to wait for a long time (2 minutes-ish) for optimizations
|
||||
# to quiesce.
|
||||
# TODO: once the initial attachment selection is fixed, update this
|
||||
# to use `reconcile_until_idle`.
|
||||
time.sleep(2)
|
||||
|
||||
nodes = env.storage_controller.node_list()
|
||||
assert len(nodes) == 2
|
||||
|
||||
@@ -1,11 +1,17 @@
|
||||
import concurrent.futures
|
||||
import enum
|
||||
import os
|
||||
import shutil
|
||||
from threading import Thread
|
||||
|
||||
import pytest
|
||||
from fixtures.common_types import Lsn, TenantId, TimelineId
|
||||
from fixtures.log_helper import log
|
||||
from fixtures.neon_fixtures import (
|
||||
NeonEnvBuilder,
|
||||
PgBin,
|
||||
StorageScrubber,
|
||||
last_flush_lsn_upload,
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverApiException
|
||||
@@ -13,33 +19,18 @@ from fixtures.pageserver.utils import (
|
||||
MANY_SMALL_LAYERS_TENANT_CONFIG,
|
||||
assert_prefix_empty,
|
||||
assert_prefix_not_empty,
|
||||
poll_for_remote_storage_iterations,
|
||||
tenant_delete_wait_completed,
|
||||
wait_for_upload,
|
||||
wait_tenant_status_404,
|
||||
wait_until_tenant_active,
|
||||
wait_until_tenant_state,
|
||||
)
|
||||
from fixtures.remote_storage import RemoteStorageKind, s3_storage
|
||||
from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
|
||||
from fixtures.utils import run_pg_bench_small, wait_until
|
||||
from requests.exceptions import ReadTimeout
|
||||
|
||||
|
||||
def error_tolerant_delete(ps_http, tenant_id):
|
||||
"""
|
||||
For tests that inject 500 errors, we must retry repeatedly when issuing deletions
|
||||
"""
|
||||
while True:
|
||||
try:
|
||||
ps_http.tenant_delete(tenant_id=tenant_id)
|
||||
except PageserverApiException as e:
|
||||
if e.status_code == 500:
|
||||
# This test uses failure injection, which can produce 500s as the pageserver expects
|
||||
# the object store to always be available, and the ListObjects during deletion is generally
|
||||
# an infallible operation
|
||||
assert "simulated failure of remote operation" in e.message
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
# Success, drop out
|
||||
break
|
||||
|
||||
|
||||
def test_tenant_delete_smoke(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
pg_bin: PgBin,
|
||||
@@ -68,7 +59,21 @@ def test_tenant_delete_smoke(
|
||||
|
||||
# Check that deleting a non-existent tenant gives the expected result: this is a loop because we
|
||||
# may need to retry on some remote storage errors injected by the test harness
|
||||
error_tolerant_delete(ps_http, tenant_id)
|
||||
while True:
|
||||
try:
|
||||
ps_http.tenant_delete(tenant_id=tenant_id)
|
||||
except PageserverApiException as e:
|
||||
if e.status_code == 500:
|
||||
# This test uses failure injection, which can produce 500s as the pageserver expects
|
||||
# the object store to always be available, and the ListObjects during deletion is generally
|
||||
# an infallible operation
|
||||
assert "simulated failure of remote operation" in e.message
|
||||
elif e.status_code == 404:
|
||||
# This is our expected result: trying to erase a non-existent tenant gives us 404
|
||||
assert "NotFound" in e.message
|
||||
break
|
||||
else:
|
||||
raise
|
||||
|
||||
env.neon_cli.create_tenant(
|
||||
tenant_id=tenant_id,
|
||||
@@ -103,8 +108,10 @@ def test_tenant_delete_smoke(
|
||||
# Upload a heatmap so that we exercise deletion of that too
|
||||
ps_http.tenant_heatmap_upload(tenant_id)
|
||||
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
|
||||
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2
|
||||
error_tolerant_delete(ps_http, tenant_id)
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
|
||||
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
|
||||
|
||||
tenant_path = env.pageserver.tenant_dir(tenant_id)
|
||||
@@ -122,7 +129,286 @@ def test_tenant_delete_smoke(
|
||||
|
||||
# Deletion updates the tenant count: the one default tenant remains
|
||||
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
|
||||
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0
|
||||
|
||||
|
||||
class Check(enum.Enum):
|
||||
RETRY_WITHOUT_RESTART = enum.auto()
|
||||
RETRY_WITH_RESTART = enum.auto()
|
||||
|
||||
|
||||
FAILPOINTS = [
|
||||
"tenant-delete-before-shutdown",
|
||||
"tenant-delete-before-create-remote-mark",
|
||||
"tenant-delete-before-create-local-mark",
|
||||
"tenant-delete-before-background",
|
||||
"tenant-delete-before-polling-ongoing-deletions",
|
||||
"tenant-delete-before-cleanup-remaining-fs-traces",
|
||||
"tenant-delete-before-remove-timelines-dir",
|
||||
"tenant-delete-before-remove-deleted-mark",
|
||||
"tenant-delete-before-remove-tenant-dir",
|
||||
# Some failpoints from timeline deletion
|
||||
"timeline-delete-before-index-deleted-at",
|
||||
"timeline-delete-before-rm",
|
||||
"timeline-delete-before-index-delete",
|
||||
]
|
||||
|
||||
FAILPOINTS_BEFORE_BACKGROUND = [
|
||||
"timeline-delete-before-schedule",
|
||||
"tenant-delete-before-shutdown",
|
||||
"tenant-delete-before-create-remote-mark",
|
||||
"tenant-delete-before-create-local-mark",
|
||||
"tenant-delete-before-background",
|
||||
]
|
||||
|
||||
|
||||
def combinations():
|
||||
result = []
|
||||
|
||||
remotes = available_s3_storages()
|
||||
|
||||
for remote_storage_kind in remotes:
|
||||
for delete_failpoint in FAILPOINTS:
|
||||
# Simulate failures for only one type of remote storage
|
||||
# to avoid log pollution and make tests run faster
|
||||
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
|
||||
simulate_failures = True
|
||||
else:
|
||||
simulate_failures = False
|
||||
result.append((remote_storage_kind, delete_failpoint, simulate_failures))
|
||||
return result
|
||||
|
||||
|
||||
@pytest.mark.parametrize("check", list(Check))
|
||||
@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
|
||||
def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
remote_storage_kind: RemoteStorageKind,
|
||||
failpoint: str,
|
||||
simulate_failures: bool,
|
||||
check: Check,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
if simulate_failures:
|
||||
neon_env_builder.pageserver_config_override = "test_remote_failures=1"
|
||||
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# From deletion polling
|
||||
f".*NotFound: tenant {env.initial_tenant}.*",
|
||||
# allow errors caused by failpoints
|
||||
f".*failpoint: {failpoint}",
|
||||
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
|
||||
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
|
||||
# We may leave some upload tasks in the queue. They're likely deletes.
|
||||
# For uploads we explicitly wait with `last_flush_lsn_upload` below.
|
||||
# So by ignoring these instead of waiting for empty upload queue
|
||||
# we execute more distinct code paths.
|
||||
'.*stopping left-over name="remote upload".*',
|
||||
# an on-demand is cancelled by shutdown
|
||||
".*initial size calculation failed: downloading failed, possibly for shutdown",
|
||||
]
|
||||
)
|
||||
|
||||
if simulate_failures:
|
||||
env.pageserver.allowed_errors.append(
|
||||
# The deletion queue will complain when it encounters simulated S3 errors
|
||||
".*deletion executor: DeleteObjects request failed.*",
|
||||
)
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
|
||||
with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
|
||||
# generate enough layers
|
||||
run_pg_bench_small(pg_bin, endpoint.connstr())
|
||||
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(tenant_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
ps_http.configure_failpoints((failpoint, "return"))
|
||||
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
|
||||
# These failpoints are earlier than background task is spawned.
|
||||
# so they result in api request failure.
|
||||
if failpoint in FAILPOINTS_BEFORE_BACKGROUND:
|
||||
with pytest.raises(PageserverApiException, match=failpoint):
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
|
||||
else:
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
tenant_info = wait_until_tenant_state(
|
||||
pageserver_http=ps_http,
|
||||
tenant_id=tenant_id,
|
||||
expected_state="Broken",
|
||||
iterations=iterations,
|
||||
)
|
||||
|
||||
reason = tenant_info["state"]["data"]["reason"]
|
||||
log.info(f"tenant broken: {reason}")
|
||||
|
||||
# failpoint may not be the only error in the stack
|
||||
assert reason.endswith(f"failpoint: {failpoint}"), reason
|
||||
|
||||
if check is Check.RETRY_WITH_RESTART:
|
||||
env.pageserver.restart()
|
||||
|
||||
if failpoint in (
|
||||
"tenant-delete-before-shutdown",
|
||||
"tenant-delete-before-create-remote-mark",
|
||||
):
|
||||
wait_until_tenant_active(
|
||||
ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
|
||||
)
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
|
||||
else:
|
||||
# Pageserver should've resumed deletion after restart.
|
||||
wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10)
|
||||
elif check is Check.RETRY_WITHOUT_RESTART:
|
||||
# this should succeed
|
||||
# this also checks that delete can be retried even when tenant is in Broken state
|
||||
ps_http.configure_failpoints((failpoint, "off"))
|
||||
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
|
||||
|
||||
tenant_dir = env.pageserver.tenant_dir(tenant_id)
|
||||
# Check local is empty
|
||||
assert not tenant_dir.exists()
|
||||
|
||||
# Check remote is empty
|
||||
assert_prefix_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(tenant_id),
|
||||
)
|
||||
),
|
||||
allowed_postfix="initdb.tar.zst",
|
||||
)
|
||||
|
||||
|
||||
def test_tenant_delete_is_resumed_on_attach(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
remote_storage_kind = s3_storage()
|
||||
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
|
||||
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
|
||||
env.pageserver.allowed_errors.append(
|
||||
# lucky race with stopping from flushing a layer we fail to schedule any uploads
|
||||
".*layer flush task.+: could not flush frozen layer: update_metadata_file"
|
||||
)
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
# create two timelines
|
||||
for timeline in ["first", "second"]:
|
||||
timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id)
|
||||
with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
|
||||
run_pg_bench_small(pg_bin, endpoint.connstr())
|
||||
wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
|
||||
|
||||
# sanity check, data should be there
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(tenant_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
# failpoint before we remove index_part from s3
|
||||
failpoint = "timeline-delete-before-index-delete"
|
||||
ps_http.configure_failpoints((failpoint, "return"))
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
(
|
||||
# allow errors caused by failpoints
|
||||
f".*failpoint: {failpoint}",
|
||||
# From deletion polling
|
||||
f".*NotFound: tenant {env.initial_tenant}.*",
|
||||
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
|
||||
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
|
||||
# error from http response is also logged
|
||||
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
|
||||
'.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
|
||||
)
|
||||
)
|
||||
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
|
||||
tenant_info = wait_until_tenant_state(
|
||||
pageserver_http=ps_http,
|
||||
tenant_id=tenant_id,
|
||||
expected_state="Broken",
|
||||
iterations=iterations,
|
||||
)
|
||||
|
||||
assert_prefix_not_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(tenant_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
reason = tenant_info["state"]["data"]["reason"]
|
||||
# failpoint may not be the only error in the stack
|
||||
assert reason.endswith(f"failpoint: {failpoint}"), reason
|
||||
|
||||
# now we stop pageserver and remove local tenant state
|
||||
env.endpoints.stop_all()
|
||||
env.pageserver.stop()
|
||||
|
||||
dir_to_clear = env.pageserver.tenant_dir()
|
||||
shutil.rmtree(dir_to_clear)
|
||||
os.mkdir(dir_to_clear)
|
||||
|
||||
env.pageserver.start()
|
||||
|
||||
# now we call attach
|
||||
env.pageserver.tenant_attach(tenant_id=tenant_id)
|
||||
|
||||
# delete should be resumed
|
||||
wait_tenant_status_404(ps_http, tenant_id, iterations)
|
||||
|
||||
# we shouldn've created tenant dir on disk
|
||||
tenant_path = env.pageserver.tenant_dir(tenant_id)
|
||||
assert not tenant_path.exists()
|
||||
|
||||
ps_http.deletion_queue_flush(execute=True)
|
||||
assert_prefix_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(tenant_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
|
||||
@@ -197,6 +483,105 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
|
||||
deletion.join()
|
||||
|
||||
|
||||
def test_tenant_delete_concurrent(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
pg_bin: PgBin,
|
||||
):
|
||||
"""
|
||||
Validate that concurrent delete requests to the same tenant behave correctly:
|
||||
exactly one should execute: the rest should give 202 responses but not start
|
||||
another deletion.
|
||||
|
||||
This is a reproducer for https://github.com/neondatabase/neon/issues/5936
|
||||
"""
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
|
||||
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
|
||||
ps_http = env.pageserver.http_client()
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
# Populate some data
|
||||
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
|
||||
run_pg_bench_small(pg_bin, endpoint.connstr())
|
||||
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
|
||||
|
||||
env.pageserver.allowed_errors.extend(
|
||||
[
|
||||
# lucky race with stopping from flushing a layer we fail to schedule any uploads
|
||||
".*layer flush task.+: could not flush frozen layer: update_metadata_file",
|
||||
]
|
||||
)
|
||||
|
||||
BEFORE_REMOVE_FAILPOINT = "tenant-delete-before-map-remove"
|
||||
BEFORE_RUN_FAILPOINT = "tenant-delete-before-run"
|
||||
|
||||
# We will let the initial delete run until right before it would remove
|
||||
# the tenant's TenantSlot. This pauses it in a state where the tenant
|
||||
# is visible in Stopping state, and concurrent requests should fail with 4xx.
|
||||
ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "pause"))
|
||||
|
||||
def delete_tenant():
|
||||
return ps_http.tenant_delete(tenant_id)
|
||||
|
||||
def hit_remove_failpoint():
|
||||
return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1]
|
||||
|
||||
def hit_run_failpoint():
|
||||
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
background_200_req = executor.submit(delete_tenant)
|
||||
assert background_200_req.result(timeout=10).status_code == 202
|
||||
|
||||
# Wait until the first request completes its work and is blocked on removing
|
||||
# the TenantSlot from tenant manager.
|
||||
log_cursor = wait_until(100, 0.1, hit_remove_failpoint)
|
||||
assert log_cursor is not None
|
||||
|
||||
# Start another request: this should succeed without actually entering the deletion code
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
assert not env.pageserver.log_contains(
|
||||
f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
|
||||
)
|
||||
|
||||
# Start another background request, which will pause after acquiring a TenantSlotGuard
|
||||
# but before completing.
|
||||
ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "pause"))
|
||||
background_4xx_req = executor.submit(delete_tenant)
|
||||
wait_until(100, 0.1, hit_run_failpoint)
|
||||
|
||||
# The TenantSlot is still present while the original request is hung before
|
||||
# final removal
|
||||
assert (
|
||||
ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
|
||||
)
|
||||
|
||||
# Permit the original request to run to success
|
||||
ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off"))
|
||||
|
||||
# Permit the duplicate background request to run to completion and fail.
|
||||
ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off"))
|
||||
background_4xx_req.result(timeout=10)
|
||||
assert not env.pageserver.log_contains(
|
||||
f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
|
||||
)
|
||||
|
||||
# Physical deletion should have happened
|
||||
assert_prefix_empty(
|
||||
neon_env_builder.pageserver_remote_storage,
|
||||
prefix="/".join(
|
||||
(
|
||||
"tenants",
|
||||
str(tenant_id),
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
# Zero tenants remain (we deleted the default tenant)
|
||||
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
|
||||
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0
|
||||
|
||||
|
||||
def test_tenant_delete_races_timeline_creation(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
pg_bin: PgBin,
|
||||
@@ -289,7 +674,9 @@ def test_tenant_delete_races_timeline_creation(
|
||||
# Disable the failpoint and wait for deletion to finish
|
||||
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
|
||||
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)
|
||||
|
||||
# Physical deletion should have happened
|
||||
assert_prefix_empty(
|
||||
@@ -340,7 +727,8 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
|
||||
|
||||
env.start()
|
||||
ps_http = env.pageserver.http_client()
|
||||
ps_http.tenant_delete(tenant_id)
|
||||
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
|
||||
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
|
||||
env.stop()
|
||||
|
||||
scrubber.scan_metadata()
|
||||
|
||||
@@ -344,6 +344,56 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
|
||||
pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
|
||||
|
||||
|
||||
# Creates and ignores a tenant, then detaches it: first, with no parameters (should fail),
|
||||
# then with parameters to force ignored tenant detach (should not fail).
|
||||
def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
|
||||
env = neon_simple_env
|
||||
client = env.pageserver.http_client()
|
||||
|
||||
# create a new tenant
|
||||
tenant_id, _ = env.neon_cli.create_tenant()
|
||||
|
||||
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
|
||||
|
||||
# assert tenant exists on disk
|
||||
assert env.pageserver.tenant_dir(tenant_id).exists()
|
||||
|
||||
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
|
||||
# we rely upon autocommit after each statement
|
||||
endpoint.safe_psql_many(
|
||||
queries=[
|
||||
"CREATE TABLE t(key int primary key, value text)",
|
||||
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
|
||||
]
|
||||
)
|
||||
|
||||
# ignore tenant
|
||||
client.tenant_ignore(tenant_id)
|
||||
env.pageserver.allowed_errors.append(".*NotFound: tenant .*")
|
||||
# ensure tenant couldn't be detached without the special flag for ignored tenant
|
||||
log.info("detaching ignored tenant WITHOUT required flag")
|
||||
with pytest.raises(
|
||||
expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
|
||||
):
|
||||
client.tenant_detach(tenant_id)
|
||||
|
||||
log.info("tenant detached failed as expected")
|
||||
|
||||
# ensure tenant is detached with ignore state
|
||||
log.info("detaching ignored tenant with required flag")
|
||||
client.tenant_detach(tenant_id, True)
|
||||
log.info("ignored tenant detached without error")
|
||||
|
||||
# check that nothing is left on disk for deleted tenant
|
||||
assert not env.pageserver.tenant_dir(tenant_id).exists()
|
||||
|
||||
# assert the tenant does not exists in the Pageserver
|
||||
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
|
||||
assert (
|
||||
tenant_id not in tenants_after_detach
|
||||
), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory"
|
||||
|
||||
|
||||
# Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach.
|
||||
# Tenant should be detached without issues.
|
||||
def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
|
||||
@@ -450,6 +500,153 @@ def test_detach_while_attaching(
|
||||
cur.execute("SELECT COUNT(*) FROM foo")
|
||||
|
||||
|
||||
# Tests that `ignore` and `get` operations' combination is able to remove and restore the tenant in pageserver's memory.
|
||||
# * writes some data into tenant's timeline
|
||||
# * ensures it's synced with the remote storage
|
||||
# * `ignore` the tenant
|
||||
# * verify that ignored tenant files are generally unchanged, only an ignored mark had appeared
|
||||
# * verify the ignored tenant is gone from pageserver's memory
|
||||
# * restart the pageserver and verify that ignored tenant is still not loaded
|
||||
# * `load` the same tenant
|
||||
# * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines
|
||||
def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
|
||||
ignored_tenant_id, _ = env.neon_cli.create_tenant()
|
||||
tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id)
|
||||
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
|
||||
tenants_before_ignore.sort()
|
||||
timelines_before_ignore = [
|
||||
timeline["timeline_id"]
|
||||
for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
|
||||
]
|
||||
files_before_ignore = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
|
||||
|
||||
# ignore the tenant and veirfy it's not present in pageserver replies, with its files still on disk
|
||||
pageserver_http.tenant_ignore(ignored_tenant_id)
|
||||
|
||||
files_after_ignore_with_retain = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
|
||||
new_files = set(files_after_ignore_with_retain) - set(files_before_ignore)
|
||||
disappeared_files = set(files_before_ignore) - set(files_after_ignore_with_retain)
|
||||
assert (
|
||||
len(disappeared_files) == 0
|
||||
), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}"
|
||||
assert (
|
||||
len(new_files) == 1
|
||||
), f"Only tenant ignore file should appear on disk but got: {new_files}"
|
||||
|
||||
tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
|
||||
assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
|
||||
assert len(tenants_after_ignore) + 1 == len(
|
||||
tenants_before_ignore
|
||||
), "Only ignored tenant should be missing"
|
||||
|
||||
# restart the pageserver to ensure we don't load the ignore timeline
|
||||
env.pageserver.stop()
|
||||
env.pageserver.start()
|
||||
tenants_after_restart = [tenant["id"] for tenant in pageserver_http.tenant_list()]
|
||||
tenants_after_restart.sort()
|
||||
assert (
|
||||
tenants_after_restart == tenants_after_ignore
|
||||
), "Ignored tenant should not be reloaded after pageserver restart"
|
||||
|
||||
# now, load it from the local files and expect it works
|
||||
env.pageserver.tenant_load(tenant_id=ignored_tenant_id)
|
||||
wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5)
|
||||
|
||||
tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
|
||||
tenants_after_attach.sort()
|
||||
assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
|
||||
|
||||
timelines_after_ignore = [
|
||||
timeline["timeline_id"]
|
||||
for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
|
||||
]
|
||||
assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
|
||||
|
||||
|
||||
# Tests that it's possible to `load` tenants with missing layers and get them restored:
|
||||
# * writes some data into tenant's timeline
|
||||
# * ensures it's synced with the remote storage
|
||||
# * `ignore` the tenant
|
||||
# * removes all timeline's local layers
|
||||
# * `load` the same tenant
|
||||
# * ensure that it's status is `Active`
|
||||
# * check that timeline data is restored
|
||||
def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
endpoint = env.endpoints.create_start("main")
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
timeline_id = env.initial_timeline
|
||||
|
||||
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
|
||||
|
||||
data_id = 1
|
||||
data_secret = "very secret secret"
|
||||
insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
|
||||
|
||||
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
|
||||
tenants_before_ignore.sort()
|
||||
timelines_before_ignore = [
|
||||
timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
|
||||
]
|
||||
|
||||
# ignore the tenant and remove its layers
|
||||
pageserver_http.tenant_ignore(tenant_id)
|
||||
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
|
||||
layers_removed = False
|
||||
for dir_entry in timeline_dir.iterdir():
|
||||
if dir_entry.name.startswith("00000"):
|
||||
# Looks like a layer file. Remove it
|
||||
dir_entry.unlink()
|
||||
layers_removed = True
|
||||
assert layers_removed, f"Found no layers for tenant {timeline_dir}"
|
||||
|
||||
# now, load it from the local files and expect it to work due to remote storage restoration
|
||||
env.pageserver.tenant_load(tenant_id=tenant_id)
|
||||
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
|
||||
|
||||
tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
|
||||
tenants_after_attach.sort()
|
||||
assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
|
||||
|
||||
timelines_after_ignore = [
|
||||
timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
|
||||
]
|
||||
assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
|
||||
|
||||
endpoint.stop()
|
||||
endpoint.start()
|
||||
ensure_test_data(data_id, data_secret, endpoint)
|
||||
|
||||
|
||||
# Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally
|
||||
# Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored.
|
||||
def test_load_negatives(neon_env_builder: NeonEnvBuilder):
|
||||
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
|
||||
env = neon_env_builder.init_start()
|
||||
pageserver_http = env.pageserver.http_client()
|
||||
env.endpoints.create_start("main")
|
||||
|
||||
tenant_id = env.initial_tenant
|
||||
|
||||
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
|
||||
|
||||
env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
|
||||
with pytest.raises(
|
||||
expected_exception=PageserverApiException,
|
||||
match=f"tenant {tenant_id} already exists, state: Active",
|
||||
):
|
||||
env.pageserver.tenant_load(tenant_id)
|
||||
|
||||
pageserver_http.tenant_ignore(tenant_id)
|
||||
|
||||
|
||||
def test_detach_while_activating(
|
||||
neon_env_builder: NeonEnvBuilder,
|
||||
):
|
||||
@@ -573,7 +770,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
|
||||
|
||||
wait_until(10, 0.5, found_broken)
|
||||
|
||||
client.tenant_detach(env.initial_tenant)
|
||||
client.tenant_ignore(env.initial_tenant)
|
||||
|
||||
def found_cleaned_up():
|
||||
m = client.get_metrics()
|
||||
@@ -585,7 +782,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
|
||||
|
||||
wait_until(10, 0.5, found_cleaned_up)
|
||||
|
||||
env.pageserver.tenant_attach(env.initial_tenant)
|
||||
env.pageserver.tenant_load(env.initial_tenant)
|
||||
|
||||
def found_active():
|
||||
m = client.get_metrics()
|
||||
|
||||
@@ -15,6 +15,7 @@ from fixtures.pageserver.utils import (
|
||||
assert_tenant_state,
|
||||
wait_for_last_record_lsn,
|
||||
wait_for_upload,
|
||||
wait_tenant_status_404,
|
||||
)
|
||||
from fixtures.remote_storage import (
|
||||
LocalFsStorage,
|
||||
@@ -347,6 +348,9 @@ def test_tenant_relocation(
|
||||
# is no longer involved, and if it is, we will see the error
|
||||
origin_http.tenant_detach(tenant_id)
|
||||
|
||||
# Wait a little, so that the detach operation has time to finish.
|
||||
wait_tenant_status_404(origin_http, tenant_id, iterations=100, interval=1)
|
||||
|
||||
post_migration_check(ep_main, 500500, old_local_path_main)
|
||||
post_migration_check(ep_second, 1001000, old_local_path_second)
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ from fixtures.neon_fixtures import (
|
||||
)
|
||||
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
|
||||
from fixtures.pageserver.utils import (
|
||||
tenant_delete_wait_completed,
|
||||
timeline_delete_wait_completed,
|
||||
wait_until_tenant_active,
|
||||
)
|
||||
@@ -668,7 +669,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
|
||||
),
|
||||
)
|
||||
|
||||
client.tenant_delete(env.initial_tenant)
|
||||
tenant_delete_wait_completed(client, env.initial_tenant, 10)
|
||||
|
||||
client.configure_failpoints((failpoint, "off"))
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ from fixtures.neon_fixtures import (
|
||||
wait_for_last_flush_lsn,
|
||||
)
|
||||
from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
|
||||
from fixtures.pageserver.utils import wait_timeline_detail_404
|
||||
from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404
|
||||
from fixtures.remote_storage import LocalFsStorage
|
||||
from fixtures.utils import assert_pageserver_backups_equal
|
||||
|
||||
@@ -578,6 +578,7 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder):
|
||||
assert info.value.status_code == 400
|
||||
|
||||
client.tenant_delete(env.initial_tenant)
|
||||
wait_tenant_status_404(client, env.initial_tenant, 10, 1)
|
||||
|
||||
with pytest.raises(PageserverApiException) as e:
|
||||
client.detach_ancestor(env.initial_tenant, first_branch)
|
||||
|
||||
@@ -26,6 +26,7 @@ from fixtures.pageserver.utils import (
|
||||
assert_tenant_state,
|
||||
timeline_delete_wait_completed,
|
||||
wait_for_upload_queue_empty,
|
||||
wait_tenant_status_404,
|
||||
wait_until_tenant_active,
|
||||
)
|
||||
from fixtures.pg_version import PgVersion
|
||||
@@ -863,33 +864,39 @@ def delete_lazy_activating(
|
||||
):
|
||||
pageserver_http = pageserver.http_client()
|
||||
|
||||
# Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
|
||||
# logical size is paused in a failpoint. So instead we will use a log observation to check that
|
||||
# on-demand activation was triggered by the tenant deletion
|
||||
log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
|
||||
|
||||
if expect_attaching:
|
||||
assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching"
|
||||
|
||||
with concurrent.futures.ThreadPoolExecutor() as executor:
|
||||
log.info("Starting background delete")
|
||||
|
||||
def shutting_down():
|
||||
assert pageserver.log_contains(".*Waiting for timelines.*") is not None
|
||||
def activated_on_demand():
|
||||
assert pageserver.log_contains(log_match) is not None
|
||||
|
||||
def delete_tenant():
|
||||
pageserver_http.tenant_delete(delete_tenant_id)
|
||||
|
||||
background_delete = executor.submit(delete_tenant)
|
||||
|
||||
# We expect deletion to enter shutdown of the tenant even though it's in the attaching state
|
||||
log.info(f"Waiting for activation message '{log_match}'")
|
||||
try:
|
||||
# Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then
|
||||
# hang because of our failpoint blocking activation.
|
||||
wait_until(10, 1, shutting_down)
|
||||
wait_until(10, 1, activated_on_demand)
|
||||
finally:
|
||||
log.info("Clearing failpoint")
|
||||
pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
|
||||
|
||||
# Deletion should complete successfully now that failpoint is unblocked and shutdown can complete
|
||||
# Deletion should complete successfully now that failpoint is unblocked
|
||||
log.info("Joining background delete")
|
||||
background_delete.result(timeout=10)
|
||||
|
||||
# Poll for deletion to complete
|
||||
wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
|
||||
|
||||
|
||||
def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
|
||||
"""
|
||||
|
||||
@@ -10,7 +10,7 @@ from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder
|
||||
# Ensures that walreceiver does not run without any data inserted and only starts after the insertion.
|
||||
def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
|
||||
# Trigger WAL wait timeout faster
|
||||
neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '3s'"
|
||||
neon_env_builder.pageserver_config_override = "wait_lsn_timeout = '1s'"
|
||||
env = neon_env_builder.init_start()
|
||||
env.pageserver.http_client()
|
||||
|
||||
@@ -44,7 +44,7 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
|
||||
def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder):
|
||||
# Trigger WAL wait timeout faster
|
||||
def customize_pageserver_toml(ps_cfg: Dict[str, Any]):
|
||||
ps_cfg["wait_lsn_timeout"] = "3s"
|
||||
ps_cfg["wait_lsn_timeout"] = "1s"
|
||||
tenant_config = ps_cfg.setdefault("tenant_config", {})
|
||||
tenant_config["walreceiver_connect_timeout"] = "2s"
|
||||
tenant_config["lagging_wal_timeout"] = "2s"
|
||||
|
||||
Reference in New Issue
Block a user