Compare commits

..

1 Commits

Author SHA1 Message Date
Conrad Ludgate
fe7254b3fd update dashmap with new unsafe raw shards api 2024-06-19 09:03:59 +01:00
82 changed files with 2409 additions and 2229 deletions

View File

@@ -183,7 +183,8 @@ runs:
# Run the tests.
#
# --alluredir saves test results in Allure format (in a specified directory)
# The junit.xml file allows CI tools to display more fine-grained test information
# in its "Tests" tab in the results page.
# --verbose prints name of each test (helpful when there are
# multiple tests in one file)
# -rA prints summary in the end
@@ -192,6 +193,7 @@ runs:
#
mkdir -p $TEST_OUTPUT/allure/results
"${cov_prefix[@]}" ./scripts/pytest \
--junitxml=$TEST_OUTPUT/junit.xml \
--alluredir=$TEST_OUTPUT/allure/results \
--tb=short \
--verbose \

View File

@@ -36,16 +36,15 @@ jobs:
fail_on_error: true
filter_mode: nofilter
level: error
- name: Disallow 'ubuntu-latest' runners
run: |
- run: |
PAT='^\s*runs-on:.*-latest'
if grep -ERq $PAT .github/workflows; then
if grep -ERq $PAT .github/workflows
then
grep -ERl $PAT .github/workflows |\
while read -r f
do
l=$(grep -nE $PAT .github/workflows/release.yml | awk -F: '{print $1}' | head -1)
echo "::error file=$f,line=$l::Please use 'ubuntu-22.04' instead of 'ubuntu-latest'"
echo "::error file=$f,line=$l::Please, do not use ubuntu-latest images to run on, use LTS instead."
done
exit 1
fi

View File

@@ -1023,18 +1023,6 @@ jobs:
with:
fetch-depth: 0
# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
# The default value is ~/.docker
- name: Set custom docker config directory
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/login-action@v3
with:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
# `neondatabase/neon` contains multiple binaries, all of them use the same input for the version into the same version formatting library.
# Pick pageserver as currently the only binary with extra "version" features printed in the string to verify.
# Regular pageserver version string looks like
@@ -1069,11 +1057,6 @@ jobs:
docker compose -f ./docker-compose/docker-compose.yml logs || 0
docker compose -f ./docker-compose/docker-compose.yml down
- name: Remove custom docker config directory
if: always()
run: |
rm -rf .docker-custom
promote-images:
needs: [ check-permissions, tag, test-images, vm-compute-node-image ]
runs-on: ubuntu-22.04
@@ -1087,8 +1070,7 @@ jobs:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
- name: Login to dev ECR
uses: docker/login-action@v3
- uses: docker/login-action@v3
with:
registry: 369495373322.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.AWS_ACCESS_KEY_DEV }}
@@ -1122,22 +1104,6 @@ jobs:
docker buildx imagetools create -t neondatabase/neon-test-extensions-v16:latest \
neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }}
- name: Login to prod ECR
uses: docker/login-action@v3
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
with:
registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com
username: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_ACCESS_KEY_ID }}
password: ${{ secrets.PROD_GHA_RUNNER_LIMITED_AWS_SECRET_ACCESS_KEY }}
- name: Copy all images to prod ECR
if: github.ref_name == 'release'|| github.ref_name == 'release-proxy'
run: |
for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16}; do
docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \
369495373322.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }}
done
trigger-custom-extensions-build-and-wait:
needs: [ check-permissions, tag ]
runs-on: ubuntu-22.04

View File

@@ -52,15 +52,13 @@ jobs:
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
TITLE="Storage & Compute release ${RELEASE_DATE}"
cat << EOF > body.md
## ${TITLE}
## Storage & Compute release ${RELEASE_DATE}
**Please merge this Pull Request using 'Create a merge commit' button**
EOF
gh pr create --title "${TITLE}" \
gh pr create --title "Release ${RELEASE_DATE}" \
--body-file "body.md" \
--head "${RELEASE_BRANCH}" \
--base "release"
@@ -93,15 +91,13 @@ jobs:
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
TITLE="Proxy release ${RELEASE_DATE}"
cat << EOF > body.md
## ${TITLE}
## Proxy release ${RELEASE_DATE}
**Please merge this Pull Request using 'Create a merge commit' button**
EOF
gh pr create --title "${TITLE}" \
gh pr create --title "Proxy release ${RELEASE_DATE}" \
--body-file "body.md" \
--head "${RELEASE_BRANCH}" \
--base "release-proxy"

24
Cargo.lock generated
View File

@@ -1014,9 +1014,6 @@ name = "camino"
version = "1.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c"
dependencies = [
"serde",
]
[[package]]
name = "camino-tempfile"
@@ -1601,6 +1598,20 @@ dependencies = [
"parking_lot_core 0.9.8",
]
[[package]]
name = "dashmap"
version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23fadfd577acfd4485fb258011b0fd080882ea83359b6fd41304900b94ccf487"
dependencies = [
"cfg-if",
"crossbeam-utils",
"hashbrown 0.14.5",
"lock_api",
"once_cell",
"parking_lot_core 0.9.8",
]
[[package]]
name = "data-encoding"
version = "2.4.0"
@@ -2851,7 +2862,7 @@ version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4644821e1c3d7a560fe13d842d13f587c07348a1a05d3a797152d41c90c56df2"
dependencies = [
"dashmap",
"dashmap 5.5.0",
"hashbrown 0.13.2",
]
@@ -4299,7 +4310,7 @@ dependencies = [
"clap",
"consumption_metrics",
"crossbeam-deque",
"dashmap",
"dashmap 6.0.0",
"env_logger",
"fallible-iterator",
"framed-websockets",
@@ -4650,7 +4661,6 @@ dependencies = [
"futures-util",
"http-types",
"humantime",
"humantime-serde",
"hyper 0.14.26",
"itertools",
"metrics",
@@ -5758,7 +5768,6 @@ dependencies = [
"r2d2",
"reqwest 0.12.4",
"routerify",
"scopeguard",
"serde",
"serde_json",
"strum",
@@ -7371,7 +7380,6 @@ dependencies = [
"base64 0.21.1",
"base64ct",
"bytes",
"camino",
"cc",
"chrono",
"clap",

View File

@@ -77,7 +77,7 @@ const_format = "0.2"
crc32c = "0.6"
crossbeam-deque = "0.8.5"
crossbeam-utils = "0.8.5"
dashmap = { version = "5.5.0", features = ["raw-api"] }
dashmap = { version = "6.0", features = ["raw-api"] }
either = "1.8"
enum-map = "2.4.2"
enumset = "1.0.12"

View File

@@ -467,6 +467,31 @@ RUN case "${PG_VERSION}" in \
make install -j $(getconf _NPROCESSORS_ONLN) && \
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_hint_plan.control
#########################################################################################
#
# Layer "kq-imcx-pg-build"
# compile kq_imcx extension
#
#########################################################################################
FROM build-deps AS kq-imcx-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
RUN apt-get update && \
apt-get install -y git libgtk2.0-dev libpq-dev libpam-dev libxslt-dev libkrb5-dev cmake && \
wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \
echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \
mkdir kq_imcx-src && cd kq_imcx-src && tar xzf ../kq_imcx.tar.gz --strip-components=1 -C . && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\
mkdir build && cd build && \
cmake -DCMAKE_BUILD_TYPE=Release .. && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \
find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\
mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \
sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \
comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T -
#########################################################################################
#
@@ -815,6 +840,7 @@ COPY --from=hll-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=plpgsql-check-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=kq-imcx-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -935,6 +961,7 @@ COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src
#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src
COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src
COPY patches/pg_hintplan.patch /ext-src
#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src
COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src
COPY patches/pg_cron.patch /ext-src
#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src

View File

@@ -36,11 +36,11 @@ use utils::pid_file::{self, PidFileRead};
// it's waiting. If the process hasn't started/stopped after 5 seconds,
// it prints a notice that it's taking long, but keeps waiting.
//
const STOP_RETRY_TIMEOUT: Duration = Duration::from_secs(10);
const STOP_RETRIES: u128 = STOP_RETRY_TIMEOUT.as_millis() / RETRY_INTERVAL.as_millis();
const RETRY_INTERVAL: Duration = Duration::from_millis(100);
const DOT_EVERY_RETRIES: u128 = 10;
const NOTICE_AFTER_RETRIES: u128 = 50;
const RETRY_UNTIL_SECS: u64 = 10;
const RETRIES: u64 = (RETRY_UNTIL_SECS * 1000) / RETRY_INTERVAL_MILLIS;
const RETRY_INTERVAL_MILLIS: u64 = 100;
const DOT_EVERY_RETRIES: u64 = 10;
const NOTICE_AFTER_RETRIES: u64 = 50;
/// Argument to `start_process`, to indicate whether it should create pidfile or if the process creates
/// it itself.
@@ -52,7 +52,6 @@ pub enum InitialPidFile {
}
/// Start a background child process using the parameters given.
#[allow(clippy::too_many_arguments)]
pub async fn start_process<F, Fut, AI, A, EI>(
process_name: &str,
datadir: &Path,
@@ -60,7 +59,6 @@ pub async fn start_process<F, Fut, AI, A, EI>(
args: AI,
envs: EI,
initial_pid_file: InitialPidFile,
retry_timeout: &Duration,
process_status_check: F,
) -> anyhow::Result<()>
where
@@ -71,10 +69,6 @@ where
// Not generic AsRef<OsStr>, otherwise empty `envs` prevents type inference
EI: IntoIterator<Item = (String, String)>,
{
let retries: u128 = retry_timeout.as_millis() / RETRY_INTERVAL.as_millis();
if !datadir.metadata().context("stat datadir")?.is_dir() {
anyhow::bail!("`datadir` must be a directory when calling this function: {datadir:?}");
}
let log_path = datadir.join(format!("{process_name}.log"));
let process_log_file = fs::OpenOptions::new()
.create(true)
@@ -91,13 +85,7 @@ where
let background_command = command
.stdout(process_log_file)
.stderr(same_file_for_stderr)
.args(args)
// spawn all child processes in their datadir, useful for all kinds of things,
// not least cleaning up child processes e.g. after an unclean exit from the test suite:
// ```
// lsof -d cwd -a +D Users/cs/src/neon/test_output
// ```
.current_dir(datadir);
.args(args);
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
fill_rust_env_vars(background_command),
@@ -133,7 +121,7 @@ where
.unwrap();
});
for retries in 0..retries {
for retries in 0..RETRIES {
match process_started(pid, pid_file_to_check, &process_status_check).await {
Ok(true) => {
println!("\n{process_name} started and passed status check, pid: {pid}");
@@ -151,7 +139,7 @@ where
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(RETRY_INTERVAL);
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
}
Err(e) => {
println!("error starting process {process_name:?}: {e:#}");
@@ -160,10 +148,9 @@ where
}
}
println!();
anyhow::bail!(format!(
"{} did not start+pass status checks within {:?} seconds",
process_name, retry_timeout
));
anyhow::bail!(
"{process_name} did not start+pass status checks within {RETRY_UNTIL_SECS} seconds"
);
}
/// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
@@ -219,7 +206,7 @@ pub fn stop_process(
}
pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
for retries in 0..STOP_RETRIES {
for retries in 0..RETRIES {
match process_has_stopped(pid) {
Ok(true) => {
println!("\n{process_name} stopped");
@@ -235,7 +222,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
print!(".");
io::stdout().flush().unwrap();
}
thread::sleep(RETRY_INTERVAL);
thread::sleep(Duration::from_millis(RETRY_INTERVAL_MILLIS));
}
Err(e) => {
println!("{process_name} with pid {pid} failed to stop: {e:#}");
@@ -244,10 +231,7 @@ pub fn wait_until_stopped(process_name: &str, pid: Pid) -> anyhow::Result<()> {
}
}
println!();
anyhow::bail!(format!(
"{} with pid {} did not stop in {:?} seconds",
process_name, pid, STOP_RETRY_TIMEOUT
));
anyhow::bail!("{process_name} with pid {pid} did not stop in {RETRY_UNTIL_SECS} seconds");
}
fn fill_rust_env_vars(cmd: &mut Command) -> &mut Command {

View File

@@ -36,7 +36,6 @@ use std::collections::{BTreeSet, HashMap};
use std::path::PathBuf;
use std::process::exit;
use std::str::FromStr;
use std::time::Duration;
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use url::Host;
use utils::{
@@ -88,8 +87,7 @@ fn main() -> Result<()> {
handle_init(sub_args).map(Some)
} else {
// all other commands need an existing config
let mut env =
LocalEnv::load_config(&local_env::base_path()).context("Error loading config")?;
let mut env = LocalEnv::load_config().context("Error loading config")?;
let original_env = env.clone();
let rt = tokio::runtime::Builder::new_current_thread()
@@ -100,7 +98,7 @@ fn main() -> Result<()> {
let subcommand_result = match sub_name {
"tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
"timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
"start" => rt.block_on(handle_start_all(&env, get_start_timeout(sub_args))),
"start" => rt.block_on(handle_start_all(&env)),
"stop" => rt.block_on(handle_stop_all(sub_args, &env)),
"pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
"storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
@@ -366,8 +364,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
LocalEnv::init(init_conf, force)
.context("materialize initial neon_local environment on disk")?;
Ok(LocalEnv::load_config(&local_env::base_path())
.expect("freshly written config should be loadable"))
Ok(LocalEnv::load_config().expect("freshly written config should be loadable"))
}
/// The default pageserver is the one where CLI tenant/timeline operations are sent by default.
@@ -1049,20 +1046,10 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
))
}
fn get_start_timeout(args: &ArgMatches) -> &Duration {
let humantime_duration = args
.get_one::<humantime::Duration>("start-timeout")
.expect("invalid value for start-timeout");
humantime_duration.as_ref()
}
async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
match sub_match.subcommand() {
Some(("start", subcommand_args)) => {
if let Err(e) = get_pageserver(env, subcommand_args)?
.start(get_start_timeout(subcommand_args))
.await
{
if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
eprintln!("pageserver start failed: {e}");
exit(1);
}
@@ -1088,7 +1075,7 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
exit(1);
}
if let Err(e) = pageserver.start(get_start_timeout(sub_match)).await {
if let Err(e) = pageserver.start().await {
eprintln!("pageserver start failed: {e}");
exit(1);
}
@@ -1116,8 +1103,8 @@ async fn handle_storage_controller(
) -> Result<()> {
let svc = StorageController::from_env(env);
match sub_match.subcommand() {
Some(("start", start_match)) => {
if let Err(e) = svc.start(get_start_timeout(start_match)).await {
Some(("start", _start_match)) => {
if let Err(e) = svc.start().await {
eprintln!("start failed: {e}");
exit(1);
}
@@ -1176,10 +1163,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
"start" => {
let extra_opts = safekeeper_extra_opts(sub_args);
if let Err(e) = safekeeper
.start(extra_opts, get_start_timeout(sub_args))
.await
{
if let Err(e) = safekeeper.start(extra_opts).await {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
@@ -1205,10 +1189,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
let extra_opts = safekeeper_extra_opts(sub_args);
if let Err(e) = safekeeper
.start(extra_opts, get_start_timeout(sub_args))
.await
{
if let Err(e) = safekeeper.start(extra_opts).await {
eprintln!("safekeeper start failed: {}", e);
exit(1);
}
@@ -1221,18 +1202,15 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
Ok(())
}
async fn handle_start_all(
env: &local_env::LocalEnv,
retry_timeout: &Duration,
) -> anyhow::Result<()> {
async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
// Endpoints are not started automatically
broker::start_broker_process(env, retry_timeout).await?;
broker::start_broker_process(env).await?;
// Only start the storage controller if the pageserver is configured to need it
if env.control_plane_api.is_some() {
let storage_controller = StorageController::from_env(env);
if let Err(e) = storage_controller.start(retry_timeout).await {
if let Err(e) = storage_controller.start().await {
eprintln!("storage_controller start failed: {:#}", e);
try_stop_all(env, true).await;
exit(1);
@@ -1241,7 +1219,7 @@ async fn handle_start_all(
for ps_conf in &env.pageservers {
let pageserver = PageServerNode::from_env(env, ps_conf);
if let Err(e) = pageserver.start(retry_timeout).await {
if let Err(e) = pageserver.start().await {
eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
try_stop_all(env, true).await;
exit(1);
@@ -1250,7 +1228,7 @@ async fn handle_start_all(
for node in env.safekeepers.iter() {
let safekeeper = SafekeeperNode::from_env(env, node);
if let Err(e) = safekeeper.start(vec![], retry_timeout).await {
if let Err(e) = safekeeper.start(vec![]).await {
eprintln!("safekeeper {} start failed: {:#}", safekeeper.id, e);
try_stop_all(env, false).await;
exit(1);
@@ -1310,15 +1288,6 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
}
fn cli() -> Command {
let timeout_arg = Arg::new("start-timeout")
.long("start-timeout")
.short('t')
.global(true)
.help("timeout until we fail the command, e.g. 30s")
.value_parser(value_parser!(humantime::Duration))
.default_value("10s")
.required(false);
let branch_name_arg = Arg::new("branch-name")
.long("branch-name")
.help("Name of the branch to be created or used as an alias for other services")
@@ -1538,7 +1507,6 @@ fn cli() -> Command {
.subcommand(Command::new("status"))
.subcommand(Command::new("start")
.about("Start local pageserver")
.arg(timeout_arg.clone())
)
.subcommand(Command::new("stop")
.about("Stop local pageserver")
@@ -1546,15 +1514,13 @@ fn cli() -> Command {
)
.subcommand(Command::new("restart")
.about("Restart local pageserver")
.arg(timeout_arg.clone())
)
)
.subcommand(
Command::new("storage_controller")
.arg_required_else_help(true)
.about("Manage storage_controller")
.subcommand(Command::new("start").about("Start storage controller")
.arg(timeout_arg.clone()))
.subcommand(Command::new("start").about("Start storage controller"))
.subcommand(Command::new("stop").about("Stop storage controller")
.arg(stop_mode_arg.clone()))
)
@@ -1566,7 +1532,6 @@ fn cli() -> Command {
.about("Start local safekeeper")
.arg(safekeeper_id_arg.clone())
.arg(safekeeper_extra_opt_arg.clone())
.arg(timeout_arg.clone())
)
.subcommand(Command::new("stop")
.about("Stop local safekeeper")
@@ -1578,7 +1543,6 @@ fn cli() -> Command {
.arg(safekeeper_id_arg)
.arg(stop_mode_arg.clone())
.arg(safekeeper_extra_opt_arg)
.arg(timeout_arg.clone())
)
)
.subcommand(
@@ -1613,7 +1577,6 @@ fn cli() -> Command {
.arg(remote_ext_config_args)
.arg(create_test_user)
.arg(allow_multiple.clone())
.arg(timeout_arg.clone())
)
.subcommand(Command::new("reconfigure")
.about("Reconfigure the endpoint")
@@ -1665,7 +1628,6 @@ fn cli() -> Command {
.subcommand(
Command::new("start")
.about("Start page server and safekeepers")
.arg(timeout_arg.clone())
)
.subcommand(
Command::new("stop")

View File

@@ -5,18 +5,13 @@
//! ```text
//! .neon/safekeepers/<safekeeper id>
//! ```
use std::time::Duration;
use anyhow::Context;
use camino::Utf8PathBuf;
use crate::{background_process, local_env};
pub async fn start_broker_process(
env: &local_env::LocalEnv,
retry_timeout: &Duration,
) -> anyhow::Result<()> {
pub async fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
let broker = &env.broker;
let listen_addr = &broker.listen_addr;
@@ -32,7 +27,6 @@ pub async fn start_broker_process(
args,
[],
background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
retry_timeout,
|| async {
let url = broker.client_url();
let status_url = url.join("status").with_context(|| {

View File

@@ -42,8 +42,8 @@ pub struct LocalEnv {
// compute endpoints).
//
// This is not stored in the config file. Rather, this is the path where the
// config file itself is. It is read from the NEON_REPO_DIR env variable which
// must be an absolute path. If the env var is not set, $PWD/.neon is used.
// config file itself is. It is read from the NEON_REPO_DIR env variable or
// '.neon' if not given.
pub base_data_dir: PathBuf,
// Path to postgres distribution. It's expected that "bin", "include",
@@ -431,7 +431,9 @@ impl LocalEnv {
}
/// Construct `Self` from on-disk state.
pub fn load_config(repopath: &Path) -> anyhow::Result<Self> {
pub fn load_config() -> anyhow::Result<Self> {
let repopath = base_path();
if !repopath.exists() {
bail!(
"Neon config is not found in {}. You need to run 'neon_local init' first",
@@ -459,7 +461,7 @@ impl LocalEnv {
branch_name_mappings,
} = on_disk_config;
LocalEnv {
base_data_dir: repopath.to_owned(),
base_data_dir: repopath.clone(),
pg_distrib_dir,
neon_distrib_dir,
default_tenant_id,
@@ -480,7 +482,7 @@ impl LocalEnv {
"we ensure this during deserialization"
);
env.pageservers = {
let iter = std::fs::read_dir(repopath).context("open dir")?;
let iter = std::fs::read_dir(&repopath).context("open dir")?;
let mut pageservers = Vec::new();
for res in iter {
let dentry = res?;
@@ -717,25 +719,10 @@ impl LocalEnv {
}
pub fn base_path() -> PathBuf {
let path = match std::env::var_os("NEON_REPO_DIR") {
Some(val) => {
let path = PathBuf::from(val);
if !path.is_absolute() {
// repeat the env var in the error because our default is always absolute
panic!("NEON_REPO_DIR must be an absolute path, got {path:?}");
}
path
}
None => {
let pwd = std::env::current_dir()
// technically this can fail but it's quite unlikeley
.expect("determine current directory");
let pwd_abs = pwd.canonicalize().expect("canonicalize current directory");
pwd_abs.join(".neon")
}
};
assert!(path.is_absolute());
path
match std::env::var_os("NEON_REPO_DIR") {
Some(val) => PathBuf::from(val),
None => PathBuf::from(".neon"),
}
}
/// Generate a public/private key pair for JWT authentication

View File

@@ -158,8 +158,8 @@ impl PageServerNode {
.expect("non-Unicode path")
}
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
self.start_node(retry_timeout).await
pub async fn start(&self) -> anyhow::Result<()> {
self.start_node().await
}
fn pageserver_init(&self, conf: NeonLocalInitPageserverConf) -> anyhow::Result<()> {
@@ -214,15 +214,14 @@ impl PageServerNode {
Ok(())
}
async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
async fn start_node(&self) -> anyhow::Result<()> {
// TODO: using a thread here because start_process() is not async but we need to call check_status()
let datadir = self.repo_path();
print!(
"Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
"Starting pageserver node {} at '{}' in {:?}",
self.conf.id,
self.pg_connection_config.raw_address(),
datadir,
retry_timeout
datadir
);
io::stdout().flush().context("flush stdout")?;
@@ -240,7 +239,6 @@ impl PageServerNode {
args,
self.pageserver_env_variables()?,
background_process::InitialPidFile::Expect(self.pid_file()),
retry_timeout,
|| async {
let st = self.check_status().await;
match st {

View File

@@ -7,7 +7,6 @@
//! ```
use std::io::Write;
use std::path::PathBuf;
use std::time::Duration;
use std::{io, result};
use anyhow::Context;
@@ -112,16 +111,11 @@ impl SafekeeperNode {
.expect("non-Unicode path")
}
pub async fn start(
&self,
extra_opts: Vec<String>,
retry_timeout: &Duration,
) -> anyhow::Result<()> {
pub async fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<()> {
print!(
"Starting safekeeper at '{}' in '{}', retrying for {:?}",
"Starting safekeeper at '{}' in '{}'",
self.pg_connection_config.raw_address(),
self.datadir_path().display(),
retry_timeout,
self.datadir_path().display()
);
io::stdout().flush().unwrap();
@@ -206,7 +200,6 @@ impl SafekeeperNode {
&args,
self.safekeeper_env_variables()?,
background_process::InitialPidFile::Expect(self.pid_file()),
retry_timeout,
|| async {
match self.check_status().await {
Ok(()) => Ok(true),

View File

@@ -18,7 +18,7 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt;
use postgres_backend::AuthType;
use reqwest::Method;
use serde::{de::DeserializeOwned, Deserialize, Serialize};
use std::{fs, str::FromStr, time::Duration};
use std::{fs, str::FromStr};
use tokio::process::Command;
use tracing::instrument;
use url::Url;
@@ -46,7 +46,6 @@ const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
pub node_id: Option<NodeId>,
pub generation_override: Option<i32>,
}
#[derive(Serialize, Deserialize)]
@@ -224,7 +223,7 @@ impl StorageController {
Ok(database_url)
}
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
pub async fn start(&self) -> anyhow::Result<()> {
// Start a vanilla Postgres process used by the storage controller for persistence.
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
.unwrap()
@@ -272,7 +271,6 @@ impl StorageController {
db_start_args,
[],
background_process::InitialPidFile::Create(self.postgres_pid_file()),
retry_timeout,
|| self.pg_isready(&pg_bin_dir),
)
.await?;
@@ -315,19 +313,16 @@ impl StorageController {
args.push(format!("--split-threshold={split_threshold}"))
}
args.push(format!(
"--neon-local-repo-dir={}",
self.env.base_data_dir.display()
));
background_process::start_process(
COMMAND,
&self.env.base_data_dir,
&self.env.storage_controller_bin(),
args,
[],
[(
"NEON_REPO_DIR".to_string(),
self.env.base_data_dir.to_string_lossy().to_string(),
)],
background_process::InitialPidFile::Create(self.pid_file()),
retry_timeout,
|| async {
match self.ready().await {
Ok(_) => Ok(true),
@@ -445,7 +440,6 @@ impl StorageController {
let request = AttachHookRequest {
tenant_shard_id,
node_id: Some(pageserver_id),
generation_override: None,
};
let response = self

View File

@@ -5,3 +5,4 @@ TODO:
- shared across tenants
- store pages from layer files
- store pages from "in-memory layer"
- store materialized pages

View File

@@ -134,7 +134,7 @@ depends on that, so if you change it, bad things will happen.
#### page_cache_size
Size of the page cache. Unit is
Size of the page cache, to hold materialized page versions. Unit is
number of 8 kB blocks. The default is 8192, which means 64 MB.
#### max_file_descriptors

View File

@@ -209,7 +209,6 @@ pub enum NodeSchedulingPolicy {
Active,
Filling,
Pause,
PauseForRestart,
Draining,
}
@@ -221,7 +220,6 @@ impl FromStr for NodeSchedulingPolicy {
"active" => Ok(Self::Active),
"filling" => Ok(Self::Filling),
"pause" => Ok(Self::Pause),
"pause_for_restart" => Ok(Self::PauseForRestart),
"draining" => Ok(Self::Draining),
_ => Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
}
@@ -235,7 +233,6 @@ impl From<NodeSchedulingPolicy> for String {
Active => "active",
Filling => "filling",
Pause => "pause",
PauseForRestart => "pause_for_restart",
Draining => "draining",
}
.to_string()

View File

@@ -293,6 +293,22 @@ pub struct TenantCreateRequest {
pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
}
#[derive(Deserialize, Debug)]
#[serde(deny_unknown_fields)]
pub struct TenantLoadRequest {
#[serde(default)]
#[serde(skip_serializing_if = "Option::is_none")]
pub generation: Option<u32>,
}
impl std::ops::Deref for TenantCreateRequest {
type Target = TenantConfig;
fn deref(&self) -> &Self::Target {
&self.config
}
}
/// An alternative representation of `pageserver::tenant::TenantConf` with
/// simpler types.
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]

View File

@@ -14,9 +14,8 @@ aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-credential-types.workspace = true
bytes.workspace = true
camino = { workspace = true, features = ["serde1"] }
camino.workspace = true
humantime.workspace = true
humantime-serde.workspace = true
hyper = { workspace = true, features = ["stream"] }
futures.workspace = true
rand.workspace = true

View File

@@ -36,6 +36,7 @@ use futures::stream::Stream;
use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
use toml_edit::Item;
use tracing::info;
pub use self::{
@@ -450,7 +451,7 @@ impl GenericRemoteStorage {
pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
let timeout = storage_config.timeout;
Ok(match &storage_config.storage {
RemoteStorageKind::LocalFs { local_path: path } => {
RemoteStorageKind::LocalFs(path) => {
info!("Using fs root '{path}' as a remote storage");
Self::LocalFs(LocalFs::new(path.clone(), timeout)?)
}
@@ -526,28 +527,21 @@ impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
}
/// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig {
/// The storage connection configuration.
#[serde(flatten)]
pub storage: RemoteStorageKind,
/// A common timeout enforced for all requests after concurrency limiter permit has been
/// acquired.
#[serde(with = "humantime_serde", default = "default_timeout")]
pub timeout: Duration,
}
fn default_timeout() -> Duration {
RemoteStorageConfig::DEFAULT_TIMEOUT
}
/// A kind of a remote storage to connect to, with its connection configuration.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
#[serde(untagged)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RemoteStorageKind {
/// Storage based on local file system.
/// Specify a root folder to place all stored files into.
LocalFs { local_path: Utf8PathBuf },
LocalFs(Utf8PathBuf),
/// AWS S3 based storage, storing all files in the S3 bucket
/// specified by the config
AwsS3(S3Config),
@@ -557,7 +551,7 @@ pub enum RemoteStorageKind {
}
/// AWS S3 bucket coordinates and access credentials to manage the bucket contents (read and write).
#[derive(Clone, PartialEq, Eq, serde::Deserialize)]
#[derive(Clone, PartialEq, Eq)]
pub struct S3Config {
/// Name of the bucket to connect to.
pub bucket_name: String,
@@ -574,24 +568,11 @@ pub struct S3Config {
pub endpoint: Option<String>,
/// AWS S3 has various limits on its API calls, we need not to exceed those.
/// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
#[serde(default = "default_remote_storage_s3_concurrency_limit")]
pub concurrency_limit: NonZeroUsize,
#[serde(default = "default_max_keys_per_list_response")]
pub max_keys_per_list_response: Option<i32>,
#[serde(deserialize_with = "deserialize_storage_class", default)]
pub upload_storage_class: Option<StorageClass>,
}
fn default_remote_storage_s3_concurrency_limit() -> NonZeroUsize {
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
.try_into()
.unwrap()
}
fn default_max_keys_per_list_response() -> Option<i32> {
DEFAULT_MAX_KEYS_PER_LIST_RESPONSE
}
impl Debug for S3Config {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("S3Config")
@@ -608,7 +589,7 @@ impl Debug for S3Config {
}
/// Azure bucket coordinates and access credentials to manage the bucket contents (read and write).
#[derive(Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
#[derive(Clone, PartialEq, Eq)]
pub struct AzureConfig {
/// Name of the container to connect to.
pub container_name: String,
@@ -620,16 +601,10 @@ pub struct AzureConfig {
pub prefix_in_container: Option<String>,
/// Azure has various limits on its API calls, we need not to exceed those.
/// See [`DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT`] for more details.
#[serde(default = "default_remote_storage_azure_concurrency_limit")]
pub concurrency_limit: NonZeroUsize,
#[serde(default = "default_max_keys_per_list_response")]
pub max_keys_per_list_response: Option<i32>,
}
fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize {
NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap()
}
impl Debug for AzureConfig {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("AzureConfig")
@@ -646,47 +621,167 @@ impl Debug for AzureConfig {
}
}
fn deserialize_storage_class<'de, D: serde::Deserializer<'de>>(
deserializer: D,
) -> Result<Option<StorageClass>, D::Error> {
Option::<String>::deserialize(deserializer).and_then(|s| {
if let Some(s) = s {
use serde::de::Error;
let storage_class = StorageClass::from_str(&s).expect("infallible");
#[allow(deprecated)]
if matches!(storage_class, StorageClass::Unknown(_)) {
return Err(D::Error::custom(format!(
"Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}",
StorageClass::values()
)));
}
Ok(Some(storage_class))
} else {
Ok(None)
}
})
}
impl RemoteStorageConfig {
pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
let document: toml_edit::Document = match toml {
toml_edit::Item::Table(toml) => toml.clone().into(),
toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => {
toml.clone().into_table().into()
}
_ => bail!("toml not a table or inline table"),
};
let local_path = toml.get("local_path");
let bucket_name = toml.get("bucket_name");
let bucket_region = toml.get("bucket_region");
let container_name = toml.get("container_name");
let container_region = toml.get("container_region");
if document.is_empty() {
return Ok(None);
let use_azure = container_name.is_some() && container_region.is_some();
let default_concurrency_limit = if use_azure {
DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
} else {
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT
};
let concurrency_limit = NonZeroUsize::new(
parse_optional_integer("concurrency_limit", toml)?.unwrap_or(default_concurrency_limit),
)
.context("Failed to parse 'concurrency_limit' as a positive integer")?;
let max_keys_per_list_response =
parse_optional_integer::<i32, _>("max_keys_per_list_response", toml)
.context("Failed to parse 'max_keys_per_list_response' as a positive integer")?
.or(DEFAULT_MAX_KEYS_PER_LIST_RESPONSE);
let endpoint = toml
.get("endpoint")
.map(|endpoint| parse_toml_string("endpoint", endpoint))
.transpose()?;
let timeout = toml
.get("timeout")
.map(|timeout| {
timeout
.as_str()
.ok_or_else(|| anyhow::Error::msg("timeout was not a string"))
})
.transpose()
.and_then(|timeout| {
timeout
.map(humantime::parse_duration)
.transpose()
.map_err(anyhow::Error::new)
})
.context("parse timeout")?
.unwrap_or(Self::DEFAULT_TIMEOUT);
if timeout < Duration::from_secs(1) {
bail!("timeout was specified as {timeout:?} which is too low");
}
Ok(Some(toml_edit::de::from_document(document)?))
let storage = match (
local_path,
bucket_name,
bucket_region,
container_name,
container_region,
) {
// no 'local_path' nor 'bucket_name' options are provided, consider this remote storage disabled
(None, None, None, None, None) => return Ok(None),
(_, Some(_), None, ..) => {
bail!("'bucket_region' option is mandatory if 'bucket_name' is given ")
}
(_, None, Some(_), ..) => {
bail!("'bucket_name' option is mandatory if 'bucket_region' is given ")
}
(None, Some(bucket_name), Some(bucket_region), ..) => {
RemoteStorageKind::AwsS3(S3Config {
bucket_name: parse_toml_string("bucket_name", bucket_name)?,
bucket_region: parse_toml_string("bucket_region", bucket_region)?,
prefix_in_bucket: toml
.get("prefix_in_bucket")
.map(|prefix_in_bucket| {
parse_toml_string("prefix_in_bucket", prefix_in_bucket)
})
.transpose()?,
endpoint,
concurrency_limit,
max_keys_per_list_response,
upload_storage_class: toml
.get("upload_storage_class")
.map(|prefix_in_bucket| -> anyhow::Result<_> {
let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?;
let storage_class = StorageClass::from_str(&s).expect("infallible");
#[allow(deprecated)]
if matches!(storage_class, StorageClass::Unknown(_)) {
bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
}
Ok(storage_class)
})
.transpose()?,
})
}
(_, _, _, Some(_), None) => {
bail!("'container_name' option is mandatory if 'container_region' is given ")
}
(_, _, _, None, Some(_)) => {
bail!("'container_name' option is mandatory if 'container_region' is given ")
}
(None, None, None, Some(container_name), Some(container_region)) => {
RemoteStorageKind::AzureContainer(AzureConfig {
container_name: parse_toml_string("container_name", container_name)?,
storage_account: toml
.get("storage_account")
.map(|storage_account| {
parse_toml_string("storage_account", storage_account)
})
.transpose()?,
container_region: parse_toml_string("container_region", container_region)?,
prefix_in_container: toml
.get("prefix_in_container")
.map(|prefix_in_container| {
parse_toml_string("prefix_in_container", prefix_in_container)
})
.transpose()?,
concurrency_limit,
max_keys_per_list_response,
})
}
(Some(local_path), None, None, None, None) => RemoteStorageKind::LocalFs(
Utf8PathBuf::from(parse_toml_string("local_path", local_path)?),
),
(Some(_), Some(_), ..) => {
bail!("'local_path' and 'bucket_name' are mutually exclusive")
}
(Some(_), _, _, Some(_), Some(_)) => {
bail!("local_path and 'container_name' are mutually exclusive")
}
};
Ok(Some(RemoteStorageConfig { storage, timeout }))
}
}
// Helper functions to parse a toml Item
fn parse_optional_integer<I, E>(name: &str, item: &toml_edit::Item) -> anyhow::Result<Option<I>>
where
I: TryFrom<i64, Error = E>,
E: std::error::Error + Send + Sync + 'static,
{
let toml_integer = match item.get(name) {
Some(item) => item
.as_integer()
.with_context(|| format!("configure option {name} is not an integer"))?,
None => return Ok(None),
};
I::try_from(toml_integer)
.map(Some)
.with_context(|| format!("configure option {name} is too large"))
}
fn parse_toml_string(name: &str, item: &Item) -> anyhow::Result<String> {
let s = item
.as_str()
.with_context(|| format!("configure option {name} is not a string"))?;
Ok(s.to_string())
}
struct ConcurrencyLimiter {
// Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
// Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
@@ -733,11 +828,6 @@ impl ConcurrencyLimiter {
mod tests {
use super::*;
fn parse(input: &str) -> anyhow::Result<Option<RemoteStorageConfig>> {
let toml = input.parse::<toml_edit::Document>().unwrap();
RemoteStorageConfig::from_toml(toml.as_item())
}
#[test]
fn test_object_name() {
let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap();
@@ -765,71 +855,18 @@ mod tests {
let input = "local_path = '.'
timeout = '5s'";
let config = parse(input).unwrap().expect("it exists");
let toml = input.parse::<toml_edit::Document>().unwrap();
let config = RemoteStorageConfig::from_toml(toml.as_item())
.unwrap()
.expect("it exists");
assert_eq!(
config,
RemoteStorageConfig {
storage: RemoteStorageKind::LocalFs {
local_path: Utf8PathBuf::from(".")
},
storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")),
timeout: Duration::from_secs(5)
}
);
}
#[test]
fn test_s3_parsing() {
let toml = "\
bucket_name = 'foo-bar'
bucket_region = 'eu-central-1'
upload_storage_class = 'INTELLIGENT_TIERING'
timeout = '7s'
";
let config = parse(toml).unwrap().expect("it exists");
assert_eq!(
config,
RemoteStorageConfig {
storage: RemoteStorageKind::AwsS3(S3Config {
bucket_name: "foo-bar".into(),
bucket_region: "eu-central-1".into(),
prefix_in_bucket: None,
endpoint: None,
concurrency_limit: default_remote_storage_s3_concurrency_limit(),
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
upload_storage_class: Some(StorageClass::IntelligentTiering),
}),
timeout: Duration::from_secs(7)
}
);
}
#[test]
fn test_azure_parsing() {
let toml = "\
container_name = 'foo-bar'
container_region = 'westeurope'
upload_storage_class = 'INTELLIGENT_TIERING'
timeout = '7s'
";
let config = parse(toml).unwrap().expect("it exists");
assert_eq!(
config,
RemoteStorageConfig {
storage: RemoteStorageKind::AzureContainer(AzureConfig {
container_name: "foo-bar".into(),
storage_account: None,
container_region: "westeurope".into(),
prefix_in_container: None,
concurrency_limit: default_remote_storage_azure_concurrency_limit(),
max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
}),
timeout: Duration::from_secs(7)
}
);
}
}

View File

@@ -39,8 +39,8 @@ use crate::tenant::{
use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine};
use crate::{tenant::config::TenantConf, virtual_file};
use crate::{
TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME,
TIMELINE_DELETE_MARK_SUFFIX,
IGNORED_TENANT_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
};
use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -811,6 +811,11 @@ impl PageServerConf {
self.tenants_path().join(tenant_shard_id.to_string())
}
pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
self.tenant_path(tenant_shard_id)
.join(IGNORED_TENANT_FILE_NAME)
}
/// Points to a place in pageserver's local directory,
/// where certain tenant's tenantconf file should be located.
///
@@ -1463,7 +1468,7 @@ broker_endpoint = '{broker_endpoint}'
assert_eq!(
parsed_remote_storage_config,
RemoteStorageConfig {
storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() },
storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
},
"Remote storage config should correctly parse the local FS config and fill other storage defaults"

View File

@@ -850,9 +850,7 @@ mod test {
std::fs::create_dir_all(remote_fs_dir)?;
let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
let storage_config = RemoteStorageConfig {
storage: RemoteStorageKind::LocalFs {
local_path: remote_fs_dir.clone(),
},
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();

View File

@@ -78,14 +78,29 @@ paths:
delete:
description: |
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried. Deleting
a non-existent tenant is considered successful (returns 200).
Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
404 means that deletion successfully finished"
responses:
"200":
description: Tenant was successfully deleted, or was already not found.
"503":
description: Service is unavailable, or tenant is already being modified (perhaps concurrently deleted)
"404":
description: Tenant not found. This is a success result, equivalent to 200.
content:
application/json:
schema:
$ref: "#/components/schemas/NotFoundError"
"409":
description: Deletion is already in progress, continue polling
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"412":
description: Deletion may not proceed, tenant is not in Active state
content:
application/json:
schema:
$ref: "#/components/schemas/PreconditionFailedError"
/v1/tenant/{tenant_id}/time_travel_remote_storage:
parameters:
@@ -374,6 +389,48 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
/v1/tenant/{tenant_id}/ignore:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
post:
description: |
Remove tenant data (including all corresponding timelines) from pageserver's memory.
Files on local disk and remote storage are not affected.
Future pageserver restarts won't load the data back until `load` is called on such tenant.
responses:
"200":
description: Tenant ignored
/v1/tenant/{tenant_id}/load:
parameters:
- name: tenant_id
in: path
required: true
schema:
type: string
post:
description: |
Schedules an operation that attempts to load a tenant from the local disk and
synchronise it with the remote storage (if enabled), repeating pageserver's restart logic for tenant load.
If the tenant was ignored before, removes the ignore mark and continues with load scheduling.
Errors if the tenant is absent on disk, already present in memory or fails to schedule its load.
Scheduling a load does not mean that the tenant would load successfully, check tenant status to ensure load correctness.
requestBody:
required: false
content:
application/json:
schema:
$ref: "#/components/schemas/TenantLoadRequest"
responses:
"202":
description: Tenant scheduled to load successfully
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
parameters:

View File

@@ -36,7 +36,7 @@ use pageserver_api::models::TopTenantShardsRequest;
use pageserver_api::models::TopTenantShardsResponse;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
TenantLocationConfigRequest,
TenantLoadRequest, TenantLocationConfigRequest,
};
use pageserver_api::shard::ShardCount;
use pageserver_api::shard::TenantShardId;
@@ -205,6 +205,7 @@ impl From<TenantSlotError> for ApiError {
NotFound(tenant_id) => {
ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
}
e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
InProgress => {
ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
}
@@ -334,10 +335,13 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
use crate::tenant::delete::DeleteTenantError::*;
match value {
Get(g) => ApiError::from(g),
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
Timeline(t) => ApiError::from(t),
NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
SlotError(e) => e.into(),
SlotUpsertError(e) => e.into(),
Other(o) => ApiError::InternalServerError(o),
e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
Cancelled => ApiError::ShuttingDown,
}
}
@@ -887,6 +891,8 @@ async fn tenant_detach_handler(
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let detach_ignored: Option<bool> = parse_query_param(&request, "detach_ignored")?;
// This is a legacy API (`/location_conf` is the replacement). It only supports unsharded tenants
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
@@ -894,7 +900,12 @@ async fn tenant_detach_handler(
let conf = state.conf;
state
.tenant_manager
.detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
.detach_tenant(
conf,
tenant_shard_id,
detach_ignored.unwrap_or(false),
&state.deletion_queue_client,
)
.instrument(info_span!("tenant_detach", %tenant_id, shard_id=%tenant_shard_id.shard_slug()))
.await?;
@@ -921,6 +932,54 @@ async fn tenant_reset_handler(
json_response(StatusCode::OK, ())
}
async fn tenant_load_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
let maybe_body: Option<TenantLoadRequest> = json_request_or_empty_body(&mut request).await?;
let state = get_state(&request);
// The /load request is only usable when control_plane_api is not set. Once it is set, callers
// should always use /attach instead.
let generation = get_request_generation(state, maybe_body.as_ref().and_then(|r| r.generation))?;
mgr::load_tenant(
state.conf,
tenant_id,
generation,
state.broker_client.clone(),
state.remote_storage.clone(),
state.deletion_queue_client.clone(),
&ctx,
)
.instrument(info_span!("load", %tenant_id))
.await?;
json_response(StatusCode::ACCEPTED, ())
}
async fn tenant_ignore_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
check_permission(&request, Some(tenant_id))?;
let state = get_state(&request);
let conf = state.conf;
mgr::ignore_tenant(conf, tenant_id)
.instrument(info_span!("ignore_tenant", %tenant_id))
.await?;
json_response(StatusCode::OK, ())
}
async fn tenant_list_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -1012,16 +1071,23 @@ async fn tenant_delete_handler(
let state = get_state(&request);
state
let status = state
.tenant_manager
.delete_tenant(tenant_shard_id)
.delete_tenant(tenant_shard_id, ACTIVE_TENANT_TIMEOUT)
.instrument(info_span!("tenant_delete_handler",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()
))
.await?;
json_response(StatusCode::OK, ())
// Callers use 404 as success for deletions, for historical reasons.
if status == StatusCode::NOT_FOUND {
return Err(ApiError::NotFound(
anyhow::anyhow!("Deletion complete").into(),
));
}
json_response(status, ())
}
/// HTTP endpoint to query the current tenant_size of a tenant.
@@ -1441,7 +1507,7 @@ async fn put_tenant_location_config_handler(
if let LocationConfigMode::Detached = request_data.config.mode {
if let Err(e) = state
.tenant_manager
.detach_tenant(conf, tenant_shard_id, &state.deletion_queue_client)
.detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
.instrument(info_span!("tenant_detach",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug()
@@ -2698,6 +2764,12 @@ pub fn make_router(
.post("/v1/tenant/:tenant_shard_id/reset", |r| {
api_handler(r, tenant_reset_handler)
})
.post("/v1/tenant/:tenant_id/load", |r| {
api_handler(r, tenant_load_handler)
})
.post("/v1/tenant/:tenant_id/ignore", |r| {
api_handler(r, tenant_ignore_handler)
})
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|r| api_handler(r, timeline_preserve_initdb_handler),

View File

@@ -136,6 +136,13 @@ pub(crate) const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit";
pub(crate) const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
/// A marker file to prevent pageserver from loading a certain tenant on restart.
/// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding
/// `ignore` management API command, that expects the ignored tenant to be properly loaded
/// into pageserver's memory before being ignored.
/// Full path: `tenants/<tenant_id>/___ignored_tenant`.
pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";
pub fn is_temporary(path: &Utf8Path) -> bool {
match path.file_name() {
Some(name) => name.ends_with(TEMP_FILE_SUFFIX),

View File

@@ -145,6 +145,14 @@ impl ReconstructTimeMetrics {
}
}
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_materialized_cache_hits_direct_total",
"Number of cache hits from materialized page cache without redo",
)
.expect("failed to define a metric")
});
pub(crate) struct ReconstructDataTimeMetrics {
singular: Histogram,
vectored: Histogram,
@@ -174,6 +182,14 @@ pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> =
}
});
pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_materialized_cache_hits_total",
"Number of cache hits from materialized page cache",
)
.expect("failed to define a metric")
});
pub(crate) struct GetVectoredLatency {
map: EnumMap<TaskKind, Option<Histogram>>,
}
@@ -282,8 +298,12 @@ pub(crate) static SCAN_LATENCY: Lazy<ScanLatency> = Lazy::new(|| {
});
pub(crate) struct PageCacheMetricsForTaskKind {
pub read_accesses_materialized_page: IntCounter,
pub read_accesses_immutable: IntCounter,
pub read_hits_immutable: IntCounter,
pub read_hits_materialized_page_exact: IntCounter,
pub read_hits_materialized_page_older_lsn: IntCounter,
}
pub(crate) struct PageCacheMetrics {
@@ -316,6 +336,16 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
let content_kind = <PageContentKind as enum_map::Enum>::from_usize(content_kind);
let content_kind: &'static str = content_kind.into();
PageCacheMetricsForTaskKind {
read_accesses_materialized_page: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&[
task_kind,
"materialized_page",
content_kind,
])
.unwrap()
},
read_accesses_immutable: {
PAGE_CACHE_READ_ACCESSES
.get_metric_with_label_values(&[task_kind, "immutable", content_kind])
@@ -327,6 +357,28 @@ pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMet
.get_metric_with_label_values(&[task_kind, "immutable", content_kind, "-"])
.unwrap()
},
read_hits_materialized_page_exact: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&[
task_kind,
"materialized_page",
content_kind,
"exact",
])
.unwrap()
},
read_hits_materialized_page_older_lsn: {
PAGE_CACHE_READ_HITS
.get_metric_with_label_values(&[
task_kind,
"materialized_page",
content_kind,
"older_lsn",
])
.unwrap()
},
}
}))
})),
@@ -342,6 +394,7 @@ pub(crate) struct PageCacheSizeMetrics {
pub max_bytes: UIntGauge,
pub current_bytes_immutable: UIntGauge,
pub current_bytes_materialized_page: UIntGauge,
}
static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
@@ -367,6 +420,11 @@ pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
.get_metric_with_label_values(&["immutable"])
.unwrap()
},
current_bytes_materialized_page: {
PAGE_CACHE_SIZE_CURRENT_BYTES
.get_metric_with_label_values(&["materialized_page"])
.unwrap()
},
});
pub(crate) mod page_cache_eviction_metrics {
@@ -1347,23 +1405,17 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
.map(|ms| (ms as f64) / 1000.0)
});
pub(crate) struct BasebackupQueryTime {
ok: Histogram,
error: Histogram,
}
pub(crate) struct BasebackupQueryTime(HistogramVec);
pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
let vec = register_histogram_vec!(
"pageserver_basebackup_query_seconds",
"Histogram of basebackup queries durations, by result type",
&["result"],
COMPUTE_STARTUP_BUCKETS.to_vec(),
)
.expect("failed to define a metric");
BasebackupQueryTime {
ok: vec.get_metric_with_label_values(&["ok"]).unwrap(),
error: vec.get_metric_with_label_values(&["error"]).unwrap(),
}
BasebackupQueryTime({
register_histogram_vec!(
"pageserver_basebackup_query_seconds",
"Histogram of basebackup queries durations, by result type",
&["result"],
COMPUTE_STARTUP_BUCKETS.to_vec(),
)
.expect("failed to define a metric")
})
});
pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> {
@@ -1418,11 +1470,12 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> {
elapsed
}
};
let metric = if res.is_ok() {
&self.parent.ok
} else {
&self.parent.error
};
let label_value = if res.is_ok() { "ok" } else { "error" };
let metric = self
.parent
.0
.get_metric_with_label_values(&[label_value])
.unwrap();
metric.observe(ex_throttled.as_secs_f64());
}
}
@@ -2865,11 +2918,13 @@ pub fn preinitialize_metrics() {
// FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
// order:
// - global metrics reside in a Lazy<PageserverMetrics>
// - access via crate::metrics::PS_METRICS.some_metric.inc()
// - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc()
// - could move the statics into TimelineMetrics::new()?
// counters
[
&MATERIALIZED_PAGE_CACHE_HIT,
&MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
&UNEXPECTED_ONDEMAND_DOWNLOADS,
&WALRECEIVER_STARTED_CONNECTIONS,
&WALRECEIVER_BROKER_UPDATES,
@@ -2931,5 +2986,4 @@ pub fn preinitialize_metrics() {
// Custom
Lazy::force(&RECONSTRUCT_TIME);
Lazy::force(&tenant_throttling::TIMELINE_GET);
Lazy::force(&BASEBACKUP_QUERY_TIME);
}

View File

@@ -17,6 +17,7 @@
//!
//! Two types of pages are supported:
//!
//! * **Materialized pages**, filled & used by page reconstruction
//! * **Immutable File pages**, filled & used by [`crate::tenant::block_io`] and [`crate::tenant::ephemeral_file`].
//!
//! Note that [`crate::tenant::ephemeral_file::EphemeralFile`] is generally mutable, but, it's append-only.
@@ -27,6 +28,9 @@
//! Page cache maps from a cache key to a buffer slot.
//! The cache key uniquely identifies the piece of data that is being cached.
//!
//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
//! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
//!
//! The cache key for **immutable file** pages is [`FileId`] and a block number.
//! Users of page cache that wish to page-cache an arbitrary (immutable!) on-disk file do the following:
//! * Have a mechanism to deterministically associate the on-disk file with a [`FileId`].
@@ -78,10 +82,13 @@ use std::{
use anyhow::Context;
use once_cell::sync::OnceCell;
use pageserver_api::shard::TenantShardId;
use utils::{id::TimelineId, lsn::Lsn};
use crate::{
context::RequestContext,
metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
repository::Key,
};
static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -132,7 +139,33 @@ pub fn next_file_id() -> FileId {
#[derive(Debug, PartialEq, Eq, Clone)]
#[allow(clippy::enum_variant_names)]
enum CacheKey {
ImmutableFilePage { file_id: FileId, blkno: u32 },
MaterializedPage {
hash_key: MaterializedPageHashKey,
lsn: Lsn,
},
ImmutableFilePage {
file_id: FileId,
blkno: u32,
},
}
#[derive(Debug, PartialEq, Eq, Hash, Clone)]
struct MaterializedPageHashKey {
/// Why is this TenantShardId rather than TenantId?
///
/// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However, this
/// this not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this
/// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
/// special-cased in some other way.
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
key: Key,
}
#[derive(Clone)]
struct Version {
lsn: Lsn,
slot_idx: usize,
}
struct Slot {
@@ -203,6 +236,17 @@ impl SlotInner {
}
pub struct PageCache {
/// This contains the mapping from the cache key to buffer slot that currently
/// contains the page, if any.
///
/// TODO: This is protected by a single lock. If that becomes a bottleneck,
/// this HashMap can be replaced with a more concurrent version, there are
/// plenty of such crates around.
///
/// If you add support for caching different kinds of objects, each object kind
/// can have a separate mapping map, next to this field.
materialized_page_map: std::sync::RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,
immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,
/// The actual buffers with their metadata.
@@ -327,14 +371,175 @@ pub enum ReadBufResult<'a> {
}
impl PageCache {
//
// Section 1.1: Public interface functions for looking up and memorizing materialized page
// versions in the page cache
//
/// Look up a materialized page version.
///
/// The 'lsn' is an upper bound, this will return the latest version of
/// the given block, but not newer than 'lsn'. Returns the actual LSN of the
/// returned page.
pub async fn lookup_materialized_page(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
key: &Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Option<(Lsn, PageReadGuard)> {
let Ok(permit) = self.try_get_pinned_slot_permit().await else {
return None;
};
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_accesses_materialized_page
.inc();
let mut cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_shard_id,
timeline_id,
key: *key,
},
lsn,
};
if let Some(guard) = self
.try_lock_for_read(&mut cache_key, &mut Some(permit))
.await
{
if let CacheKey::MaterializedPage {
hash_key: _,
lsn: available_lsn,
} = cache_key
{
if available_lsn == lsn {
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_hits_materialized_page_exact
.inc();
} else {
crate::metrics::PAGE_CACHE
.for_ctx(ctx)
.read_hits_materialized_page_older_lsn
.inc();
}
Some((available_lsn, guard))
} else {
panic!("unexpected key type in slot");
}
} else {
None
}
}
///
/// Store an image of the given page in the cache.
///
pub async fn memorize_materialized_page(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
key: Key,
lsn: Lsn,
img: &[u8],
) -> anyhow::Result<()> {
let cache_key = CacheKey::MaterializedPage {
hash_key: MaterializedPageHashKey {
tenant_shard_id,
timeline_id,
key,
},
lsn,
};
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
loop {
// First check if the key already exists in the cache.
if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we don't released the mapping
// lock already, another thread could have evicted the page)
let slot = &self.slots[slot_idx];
let inner = slot.inner.write().await;
if inner.key.as_ref() == Some(&cache_key) {
slot.inc_usage_count();
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
debug_assert_eq!(inner.buf.len(), img.len());
// We already had it in cache. Another thread must've put it there
// concurrently. Check that it had the same contents that we
// replayed.
assert!(inner.buf == img);
return Ok(());
}
}
debug_assert!(permit.is_some());
// Not found. Find a victim buffer
let (slot_idx, mut inner) = self
.find_victim(permit.as_ref().unwrap())
.await
.context("Failed to find evict victim")?;
// Insert mapping for this. At this point, we may find that another
// thread did the same thing concurrently. In that case, we evicted
// our victim buffer unnecessarily. Put it into the free list and
// continue with the slot that the other thread chose.
if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
// TODO: put to free list
// We now just loop back to start from beginning. This is not
// optimal, we'll perform the lookup in the mapping again, which
// is not really necessary because we already got
// 'existing_slot_idx'. But this shouldn't happen often enough
// to matter much.
continue;
}
// Make the slot ready
let slot = &self.slots[slot_idx];
inner.key = Some(cache_key.clone());
slot.set_usage_count(1);
// Create a write guard for the slot so we go through the expected motions.
debug_assert!(
{
let guard = inner.permit.lock().unwrap();
guard.upgrade().is_none()
},
"we hold a write lock, so, no one else should have a permit"
);
let mut write_guard = PageWriteGuard {
state: PageWriteGuardState::Invalid {
_permit: permit.take().unwrap(),
inner,
},
};
write_guard.copy_from_slice(img);
let _ = write_guard.mark_valid();
return Ok(());
}
}
// Section 1.2: Public interface functions for working with immutable file pages.
pub async fn read_immutable_buf(
&self,
file_id: FileId,
blkno: u32,
ctx: &RequestContext,
) -> anyhow::Result<ReadBufResult> {
self.lock_for_read(&(CacheKey::ImmutableFilePage { file_id, blkno }), ctx)
.await
let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
self.lock_for_read(&mut cache_key, ctx).await
}
//
@@ -368,11 +573,19 @@ impl PageCache {
/// Look up a page in the cache.
///
/// If the search criteria is not exact, *cache_key is updated with the key
/// for exact key of the returned page. (For materialized pages, that means
/// that the LSN in 'cache_key' is updated with the LSN of the returned page
/// version.)
///
/// If no page is found, returns None and *cache_key is left unmodified.
///
async fn try_lock_for_read(
&self,
cache_key: &CacheKey,
cache_key: &mut CacheKey,
permit: &mut Option<PinnedSlotsPermit>,
) -> Option<PageReadGuard> {
let cache_key_orig = cache_key.clone();
if let Some(slot_idx) = self.search_mapping(cache_key) {
// The page was found in the mapping. Lock the slot, and re-check
// that it's still what we expected (because we released the mapping
@@ -385,6 +598,9 @@ impl PageCache {
_permit: inner.coalesce_readers_permit(permit.take().unwrap()),
slot_guard: inner,
});
} else {
// search_mapping might have modified the search key; restore it.
*cache_key = cache_key_orig;
}
}
None
@@ -421,12 +637,15 @@ impl PageCache {
///
async fn lock_for_read(
&self,
cache_key: &CacheKey,
cache_key: &mut CacheKey,
ctx: &RequestContext,
) -> anyhow::Result<ReadBufResult> {
let mut permit = Some(self.try_get_pinned_slot_permit().await?);
let (read_access, hit) = match cache_key {
CacheKey::MaterializedPage { .. } => {
unreachable!("Materialized pages use lookup_materialized_page")
}
CacheKey::ImmutableFilePage { .. } => (
&crate::metrics::PAGE_CACHE
.for_ctx(ctx)
@@ -498,15 +717,52 @@ impl PageCache {
/// Search for a page in the cache using the given search key.
///
/// Returns the slot index, if any.
/// Returns the slot index, if any. If the search criteria is not exact,
/// *cache_key is updated with the actual key of the found page.
///
/// NOTE: We don't hold any lock on the mapping on return, so the slot might
/// get recycled for an unrelated page immediately after this function
/// returns. The caller is responsible for re-checking that the slot still
/// contains the page with the same key before using it.
///
fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
fn search_mapping(&self, cache_key: &mut CacheKey) -> Option<usize> {
match cache_key {
CacheKey::MaterializedPage { hash_key, lsn } => {
let map = self.materialized_page_map.read().unwrap();
let versions = map.get(hash_key)?;
let version_idx = match versions.binary_search_by_key(lsn, |v| v.lsn) {
Ok(version_idx) => version_idx,
Err(0) => return None,
Err(version_idx) => version_idx - 1,
};
let version = &versions[version_idx];
*lsn = version.lsn;
Some(version.slot_idx)
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
}
}
}
/// Search for a page in the cache using the given search key.
///
/// Like 'search_mapping, but performs an "exact" search. Used for
/// allocating a new buffer.
fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
match key {
CacheKey::MaterializedPage { hash_key, lsn } => {
let map = self.materialized_page_map.read().unwrap();
let versions = map.get(hash_key)?;
if let Ok(version_idx) = versions.binary_search_by_key(lsn, |v| v.lsn) {
Some(versions[version_idx].slot_idx)
} else {
None
}
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let map = self.immutable_page_map.read().unwrap();
Some(*map.get(&(*file_id, *blkno))?)
@@ -519,6 +775,27 @@ impl PageCache {
///
fn remove_mapping(&self, old_key: &CacheKey) {
match old_key {
CacheKey::MaterializedPage {
hash_key: old_hash_key,
lsn: old_lsn,
} => {
let mut map = self.materialized_page_map.write().unwrap();
if let Entry::Occupied(mut old_entry) = map.entry(old_hash_key.clone()) {
let versions = old_entry.get_mut();
if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
versions.remove(version_idx);
self.size_metrics
.current_bytes_materialized_page
.sub_page_sz(1);
if versions.is_empty() {
old_entry.remove_entry();
}
}
} else {
panic!("could not find old key in mapping")
}
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
map.remove(&(*file_id, *blkno))
@@ -535,6 +812,30 @@ impl PageCache {
/// of the existing mapping and leaves it untouched.
fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
match new_key {
CacheKey::MaterializedPage {
hash_key: new_key,
lsn: new_lsn,
} => {
let mut map = self.materialized_page_map.write().unwrap();
let versions = map.entry(new_key.clone()).or_default();
match versions.binary_search_by_key(new_lsn, |v| v.lsn) {
Ok(version_idx) => Some(versions[version_idx].slot_idx),
Err(version_idx) => {
versions.insert(
version_idx,
Version {
lsn: *new_lsn,
slot_idx,
},
);
self.size_metrics
.current_bytes_materialized_page
.add_page_sz(1);
None
}
}
}
CacheKey::ImmutableFilePage { file_id, blkno } => {
let mut map = self.immutable_page_map.write().unwrap();
match map.entry((*file_id, *blkno)) {
@@ -648,6 +949,7 @@ impl PageCache {
let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
size_metrics.max_bytes.set_page_sz(num_pages);
size_metrics.current_bytes_immutable.set_page_sz(0);
size_metrics.current_bytes_materialized_page.set_page_sz(0);
let slots = page_buffer
.chunks_exact_mut(PAGE_SZ)
@@ -666,6 +968,7 @@ impl PageCache {
.collect();
Self {
materialized_page_map: Default::default(),
immutable_page_map: Default::default(),
slots,
next_evict_slot: AtomicUsize::new(0),

View File

@@ -3906,9 +3906,7 @@ pub(crate) mod harness {
let remote_fs_dir = conf.workdir.join("localfs");
std::fs::create_dir_all(&remote_fs_dir).unwrap();
let config = RemoteStorageConfig {
storage: RemoteStorageKind::LocalFs {
local_path: remote_fs_dir.clone(),
},
storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();

View File

@@ -6,23 +6,25 @@ use pageserver_api::{models::TenantState, shard::TenantShardId};
use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken;
use tracing::{error, Instrument};
use tracing::{error, instrument, Instrument};
use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
use crate::{
config::PageServerConf,
context::RequestContext,
task_mgr::{self},
task_mgr::{self, TaskKind},
tenant::{
mgr::{TenantSlot, TenantsMapRemoveResult},
remote_timeline_client::remote_heatmap_path,
timeline::ShutdownMode,
},
};
use super::{
mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
span,
timeline::delete::DeleteTimelineFlow,
tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
};
@@ -32,6 +34,15 @@ pub(crate) enum DeleteTenantError {
#[error("GetTenant {0}")]
Get(#[from] GetTenantError),
#[error("Tenant not attached")]
NotAttached,
#[error("Invalid state {0}. Expected Active or Broken")]
InvalidState(TenantState),
#[error("Tenant deletion is already in progress")]
AlreadyInProgress,
#[error("Tenant map slot error {0}")]
SlotError(#[from] TenantSlotError),
@@ -63,6 +74,56 @@ fn remote_tenant_delete_mark_path(
Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
}
async fn create_remote_delete_mark(
conf: &PageServerConf,
remote_storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
let data: &[u8] = &[];
backoff::retry(
|| async {
let data = bytes::Bytes::from_static(data);
let stream = futures::stream::once(futures::future::ready(Ok(data)));
remote_storage
.upload(stream, 0, &remote_mark_path, None, cancel)
.await
},
TimeoutOrCancel::caused_by_cancel,
FAILED_UPLOAD_WARN_THRESHOLD,
FAILED_REMOTE_OP_RETRIES,
"mark_upload",
cancel,
)
.await
.ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
.and_then(|x| x)
.context("mark_upload")?;
Ok(())
}
async fn create_local_delete_mark(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
) -> Result<(), DeleteTenantError> {
let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);
// Note: we're ok to replace existing file.
let _ = std::fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(&marker_path)
.with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
Ok(())
}
async fn schedule_ordered_timeline_deletions(
tenant: &Arc<Tenant>,
) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
@@ -201,6 +262,21 @@ async fn cleanup_remaining_fs_traces(
Ok(())
}
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
/// and deletes its data from both disk and s3.
/// The sequence of steps:
/// 1. Upload remote deletion mark.
/// 2. Create local mark file.
/// 3. Shutdown tasks
/// 4. Run ordered timeline deletions
/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
/// 6. Remove remote mark
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
/// It is resumable from any step in case a crash/restart occurs.
/// There are two entrypoints to the process:
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
#[derive(Default)]
pub enum DeleteTenantFlow {
#[default]
@@ -210,6 +286,91 @@ pub enum DeleteTenantFlow {
}
impl DeleteTenantFlow {
// These steps are run in the context of management api request handler.
// Long running steps are continued to run in the background.
// NB: If this fails half-way through, and is retried, the retry will go through
// all the same steps again. Make sure the code here is idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
// NOTE: static needed for background part.
// We assume that calling code sets up the span with tenant_id.
#[instrument(skip_all)]
pub(crate) async fn run(
conf: &'static PageServerConf,
remote_storage: GenericRemoteStorage,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
span::debug_assert_current_span_has_tenant_id();
pausable_failpoint!("tenant-delete-before-run");
let mut guard = Self::prepare(&tenant).await?;
if let Err(e) = Self::run_inner(&mut guard, conf, &remote_storage, &tenant, cancel).await {
tenant.set_broken(format!("{e:#}")).await;
return Err(e);
}
Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
Ok(())
}
// Helper function needed to be able to match once on returned error and transition tenant into broken state.
// This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
// will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
// So the solution is to set tenant state to broken.
async fn run_inner(
guard: &mut OwnedMutexGuard<Self>,
conf: &'static PageServerConf,
remote_storage: &GenericRemoteStorage,
tenant: &Tenant,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
guard.mark_in_progress()?;
fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-create-remote-mark"
))?
});
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
.await
.context("remote_mark")?;
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-create-local-mark"
))?
});
create_local_delete_mark(conf, &tenant.tenant_shard_id)
.await
.context("local delete mark")?;
fail::fail_point!("tenant-delete-before-background", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-background"
))?
});
Ok(())
}
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
match self {
Self::Finished => anyhow::bail!("Bug. Is in finished state"),
Self::InProgress { .. } => { /* We're in a retry */ }
Self::NotStarted => { /* Fresh start */ }
}
*self = Self::InProgress;
Ok(())
}
pub(crate) async fn should_resume_deletion(
conf: &'static PageServerConf,
remote_mark_exists: bool,
@@ -267,6 +428,79 @@ impl DeleteTenantFlow {
.await
}
/// Check whether background deletion of this tenant is currently in progress
pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
tenant.delete_progress.try_lock().is_err()
}
async fn prepare(
tenant: &Arc<Tenant>,
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
// FIXME: unsure about active only. Our init jobs may not be cancellable properly,
// so at least for now allow deletions only for active tenants. TODO recheck
// Broken and Stopping is needed for retries.
if !matches!(
tenant.current_state(),
TenantState::Active | TenantState::Broken { .. }
) {
return Err(DeleteTenantError::InvalidState(tenant.current_state()));
}
let guard = Arc::clone(&tenant.delete_progress)
.try_lock_owned()
.map_err(|_| DeleteTenantError::AlreadyInProgress)?;
fail::fail_point!("tenant-delete-before-shutdown", |_| {
Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
});
// make pageserver shutdown not to wait for our completion
let (_, progress) = completion::channel();
// It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
// i e it is an error to do:
// tenant.set_stopping
// tenant.shutdown
// Its also bad that we're holding tenants.read here.
// TODO relax set_stopping to be idempotent?
if tenant.shutdown(progress, ShutdownMode::Hard).await.is_err() {
return Err(DeleteTenantError::Other(anyhow::anyhow!(
"tenant shutdown is already in progress"
)));
}
Ok(guard)
}
fn schedule_background(
guard: OwnedMutexGuard<Self>,
conf: &'static PageServerConf,
remote_storage: GenericRemoteStorage,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
) {
let tenant_shard_id = tenant.tenant_shard_id;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_shard_id),
None,
"tenant_delete",
false,
async move {
if let Err(err) =
Self::background(guard, conf, remote_storage, tenants, &tenant).await
{
error!("Error: {err:#}");
tenant.set_broken(format!("{err:#}")).await;
};
Ok(())
}
.instrument(tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())),
);
}
async fn background(
mut guard: OwnedMutexGuard<Self>,
conf: &PageServerConf,
@@ -346,6 +580,8 @@ impl DeleteTenantFlow {
.context("cleanup_remaining_fs_traces")?;
{
pausable_failpoint!("tenant-delete-before-map-remove");
// This block is simply removing the TenantSlot for this tenant. It requires a loop because
// we might conflict with a TenantSlot::InProgress marker and need to wait for it.
//

View File

@@ -3,6 +3,7 @@
use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use hyper::StatusCode;
use itertools::Itertools;
use pageserver_api::key::Key;
use pageserver_api::models::LocationConfigMode;
@@ -26,7 +27,8 @@ use tokio::task::JoinSet;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::{backoff, completion, crashsafe};
use remote_storage::GenericRemoteStorage;
use utils::{completion, crashsafe};
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
@@ -40,11 +42,12 @@ use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
};
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState};
use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};
use utils::crashsafe::path_with_suffix_extension;
use utils::fs_ext::PathExt;
@@ -419,6 +422,12 @@ fn load_tenant_config(
}
};
let tenant_ignore_mark_file = tenant_dir_path.join(IGNORED_TENANT_FILE_NAME);
if tenant_ignore_mark_file.exists() {
info!("Found an ignore mark file {tenant_ignore_mark_file:?}, skipping the tenant");
return Ok(None);
}
Ok(Some((
tenant_shard_id,
Tenant::load_tenant_config(conf, &tenant_shard_id),
@@ -704,6 +713,12 @@ fn tenant_spawn(
"Cannot load tenant from empty directory {tenant_path:?}"
);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
anyhow::ensure!(
!conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(),
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
);
let remote_storage = resources.remote_storage.clone();
let tenant = match Tenant::spawn(
conf,
@@ -1052,7 +1067,7 @@ impl TenantManager {
// not do significant I/O, and shutdowns should be prompt via cancellation tokens.
let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)
.map_err(|e| match e {
TenantSlotError::NotFound(_) => {
TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => {
unreachable!("Called with mode Any")
}
TenantSlotError::InProgress => UpsertLocationError::InProgress,
@@ -1352,10 +1367,56 @@ impl TenantManager {
}
}
async fn delete_tenant_remote(
pub(crate) async fn delete_tenant(
&self,
tenant_shard_id: TenantShardId,
) -> Result<(), DeleteTenantError> {
activation_timeout: Duration,
) -> Result<StatusCode, DeleteTenantError> {
super::span::debug_assert_current_span_has_tenant_id();
// We acquire a SlotGuard during this function to protect against concurrent
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
// have to return the Tenant to the map while the background deletion runs.
//
// TODO: refactor deletion to happen outside the lifetime of a Tenant.
// Currently, deletion requires a reference to the tenants map in order to
// keep the Tenant in the map until deletion is complete, and then remove
// it at the end.
//
// See https://github.com/neondatabase/neon/issues/5080
// Tenant deletion can happen two ways:
// - Legacy: called on an attached location. The attached Tenant object stays alive in Stopping
// state until deletion is complete.
// - New: called on a pageserver without an attached location. We proceed with deletion from
// remote storage.
//
// See https://github.com/neondatabase/neon/issues/5080 for more context on this transition.
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
match &slot_guard.old_value {
Some(TenantSlot::Attached(tenant)) => {
// Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
// deletion will be resumed across restarts.
let tenant = tenant.clone();
return self
.delete_tenant_attached(slot_guard, tenant, activation_timeout)
.await;
}
Some(TenantSlot::Secondary(secondary_tenant)) => {
secondary_tenant.shutdown().await;
let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| {
format!("local tenant directory {local_tenant_directory:?} rename")
})?;
spawn_background_purge(tmp_dir);
}
Some(TenantSlot::InProgress(_)) => unreachable!(),
None => {}
};
// Fall through: local state for this tenant is no longer present, proceed with remote delete
let remote_path = remote_tenant_path(&tenant_shard_id);
let keys = match self
.resources
@@ -1372,7 +1433,7 @@ impl TenantManager {
Err(remote_storage::DownloadError::Cancelled) => {
return Err(DeleteTenantError::Cancelled)
}
Err(remote_storage::DownloadError::NotFound) => return Ok(()),
Err(remote_storage::DownloadError::NotFound) => return Ok(StatusCode::NOT_FOUND),
Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))),
};
@@ -1386,83 +1447,60 @@ impl TenantManager {
.await?;
}
Ok(())
// Callers use 404 as success for deletions, for historical reasons.
Ok(StatusCode::NOT_FOUND)
}
/// If a tenant is attached, detach it. Then remove its data from remote storage.
///
/// A tenant is considered deleted once it is gone from remote storage. It is the caller's
/// responsibility to avoid trying to attach the tenant again or use it any way once deletion
/// has started: this operation is not atomic, and must be retried until it succeeds.
pub(crate) async fn delete_tenant(
async fn delete_tenant_attached(
&self,
tenant_shard_id: TenantShardId,
) -> Result<(), DeleteTenantError> {
super::span::debug_assert_current_span_has_tenant_id();
async fn delete_local(
conf: &PageServerConf,
tenant_shard_id: &TenantShardId,
) -> anyhow::Result<()> {
let local_tenant_directory = conf.tenant_path(tenant_shard_id);
let tmp_dir = safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| {
format!("local tenant directory {local_tenant_directory:?} rename")
})?;
spawn_background_purge(tmp_dir);
Ok(())
slot_guard: SlotGuard,
tenant: Arc<Tenant>,
activation_timeout: Duration,
) -> Result<StatusCode, DeleteTenantError> {
match tenant.current_state() {
TenantState::Broken { .. } | TenantState::Stopping { .. } => {
// If deletion is already in progress, return success (the semantics of this
// function are to rerturn success afterr deletion is spawned in background).
// Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
if DeleteTenantFlow::is_in_progress(&tenant) {
// The `delete_progress` lock is held: deletion is already happening
// in the bacckground
slot_guard.revert();
return Ok(StatusCode::ACCEPTED);
}
}
_ => {
tenant
.wait_to_become_active(activation_timeout)
.await
.map_err(|e| match e {
GetActiveTenantError::WillNotBecomeActive(_)
| GetActiveTenantError::Broken(_) => {
DeleteTenantError::InvalidState(tenant.current_state())
}
GetActiveTenantError::Cancelled => DeleteTenantError::Cancelled,
GetActiveTenantError::NotFound(_) => DeleteTenantError::NotAttached,
GetActiveTenantError::WaitForActiveTimeout {
latest_state: _latest_state,
wait_time: _wait_time,
} => DeleteTenantError::InvalidState(tenant.current_state()),
})?;
}
}
let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
match &slot_guard.old_value {
Some(TenantSlot::Attached(tenant)) => {
// Legacy deletion flow: the tenant remains attached, goes to Stopping state, and
// deletion will be resumed across restarts.
let tenant = tenant.clone();
let (_guard, progress) = utils::completion::channel();
match tenant.shutdown(progress, ShutdownMode::Hard).await {
Ok(()) => {}
Err(barrier) => {
info!("Shutdown already in progress, waiting for it to complete");
barrier.wait().await;
}
}
delete_local(self.conf, &tenant_shard_id).await?;
}
Some(TenantSlot::Secondary(secondary_tenant)) => {
secondary_tenant.shutdown().await;
delete_local(self.conf, &tenant_shard_id).await?;
}
Some(TenantSlot::InProgress(_)) => unreachable!(),
None => {}
};
// Fall through: local state for this tenant is no longer present, proceed with remote delete.
// - We use a retry wrapper here so that common transient S3 errors (e.g. 503, 429) do not result
// in 500 responses to delete requests.
// - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will
// 503/retry, rather than kicking off a wasteful concurrent deletion.
match backoff::retry(
|| async move { self.delete_tenant_remote(tenant_shard_id).await },
|e| match e {
DeleteTenantError::Cancelled => true,
DeleteTenantError::SlotError(_) => {
unreachable!("Remote deletion doesn't touch slots")
}
_ => false,
},
1,
3,
&format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"),
let result = DeleteTenantFlow::run(
self.conf,
self.resources.remote_storage.clone(),
&TENANTS,
tenant,
&self.cancel,
)
.await
{
Some(r) => r,
None => Err(DeleteTenantError::Cancelled),
}
.await;
// The Tenant goes back into the map in Stopping state, it will eventually be removed by DeleteTenantFLow
slot_guard.revert();
let () = result?;
Ok(StatusCode::ACCEPTED)
}
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
@@ -1863,10 +1901,17 @@ impl TenantManager {
&self,
conf: &'static PageServerConf,
tenant_shard_id: TenantShardId,
detach_ignored: bool,
deletion_queue_client: &DeletionQueueClient,
) -> Result<(), TenantStateError> {
let tmp_path = self
.detach_tenant0(conf, &TENANTS, tenant_shard_id, deletion_queue_client)
.detach_tenant0(
conf,
&TENANTS,
tenant_shard_id,
detach_ignored,
deletion_queue_client,
)
.await?;
spawn_background_purge(tmp_path);
@@ -1878,6 +1923,7 @@ impl TenantManager {
conf: &'static PageServerConf,
tenants: &std::sync::RwLock<TenantsMap>,
tenant_shard_id: TenantShardId,
detach_ignored: bool,
deletion_queue_client: &DeletionQueueClient,
) -> Result<Utf8PathBuf, TenantStateError> {
let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
@@ -1900,6 +1946,26 @@ impl TenantManager {
// before this tenant is potentially re-attached elsewhere.
deletion_queue_client.flush_advisory();
// Ignored tenants are not present in memory and will bail the removal from memory operation.
// Before returning the error, check for ignored tenant removal case — we only need to clean its local files then.
if detach_ignored
&& matches!(
removal_result,
Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
)
{
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
if tenant_ignore_mark.exists() {
info!("Detaching an ignored tenant");
let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
.await
.with_context(|| {
format!("Ignored tenant {tenant_shard_id} local directory rename")
})?;
return Ok(tmp_path);
}
}
removal_result
}
@@ -2156,6 +2222,97 @@ pub(crate) enum TenantStateError {
Other(#[from] anyhow::Error),
}
pub(crate) async fn load_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
generation: Generation,
broker_client: storage_broker::BrokerClientChannel,
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
ctx: &RequestContext,
) -> Result<(), TenantMapInsertError> {
// This is a legacy API (replaced by `/location_conf`). It does not support sharding
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
let slot_guard =
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
let tenant_path = conf.tenant_path(&tenant_shard_id);
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
if tenant_ignore_mark.exists() {
std::fs::remove_file(&tenant_ignore_mark).with_context(|| {
format!(
"Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"
)
})?;
}
let resources = TenantSharedResources {
broker_client,
remote_storage,
deletion_queue_client,
};
let mut location_conf =
Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
location_conf.attach_in_generation(AttachmentMode::Single, generation);
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
let shard_identity = location_conf.shard;
let new_tenant = tenant_spawn(
conf,
tenant_shard_id,
&tenant_path,
resources,
AttachedTenantConf::try_from(location_conf)?,
shard_identity,
None,
&TENANTS,
SpawnMode::Eager,
ctx,
)
.with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
slot_guard.upsert(TenantSlot::Attached(new_tenant))?;
Ok(())
}
pub(crate) async fn ignore_tenant(
conf: &'static PageServerConf,
tenant_id: TenantId,
) -> Result<(), TenantStateError> {
ignore_tenant0(conf, &TENANTS, tenant_id).await
}
#[instrument(skip_all, fields(shard_id))]
async fn ignore_tenant0(
conf: &'static PageServerConf,
tenants: &std::sync::RwLock<TenantsMap>,
tenant_id: TenantId,
) -> Result<(), TenantStateError> {
// This is a legacy API (replaced by `/location_conf`). It does not support sharding
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
tracing::Span::current().record(
"shard_id",
tracing::field::display(tenant_shard_id.shard_slug()),
);
remove_tenant_from_memory(tenants, tenant_shard_id, async {
let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
fs::File::create(&ignore_mark_file)
.await
.context("Failed to create ignore mark file")
.and_then(|_| {
crashsafe::fsync_file_and_parent(&ignore_mark_file)
.context("Failed to fsync ignore mark file")
})
.with_context(|| format!("Failed to crate ignore mark for tenant {tenant_shard_id}"))?;
Ok(())
})
.await
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum TenantMapListError {
#[error("tenant map is still initiailizing")]
@@ -2180,6 +2337,10 @@ pub(crate) enum TenantSlotError {
#[error("Tenant {0} not found")]
NotFound(TenantShardId),
/// When acquiring a slot with the expectation that the tenant does not already exist.
#[error("tenant {0} already exists, state: {1:?}")]
AlreadyExists(TenantShardId, TenantState),
// Tried to read a slot that is currently being mutated by another administrative
// operation.
#[error("tenant has a state change in progress, try again later")]
@@ -2495,6 +2656,8 @@ enum TenantSlotAcquireMode {
Any,
/// Return an error if trying to acquire a slot and it doesn't already exist
MustExist,
/// Return an error if trying to acquire a slot and it already exists
MustNotExist,
}
fn tenant_map_acquire_slot(
@@ -2548,6 +2711,27 @@ fn tenant_map_acquire_slot_impl(
tracing::debug!("Occupied, failing for InProgress");
Err(TenantSlotError::InProgress)
}
(slot, MustNotExist) => match slot {
TenantSlot::Attached(tenant) => {
tracing::debug!("Attached && MustNotExist, return AlreadyExists");
Err(TenantSlotError::AlreadyExists(
*tenant_shard_id,
tenant.current_state(),
))
}
_ => {
// FIXME: the AlreadyExists error assumes that we have a Tenant
// to get the state from
tracing::debug!("Occupied & MustNotExist, return AlreadyExists");
Err(TenantSlotError::AlreadyExists(
*tenant_shard_id,
TenantState::Broken {
reason: "Present but not attached".to_string(),
backtrace: "".to_string(),
},
))
}
},
_ => {
// Happy case: the slot was not in any state that violated our mode
let (completion, barrier) = utils::completion::channel();

View File

@@ -101,7 +101,9 @@ use crate::{
use crate::config::PageServerConf;
use crate::keyspace::{KeyPartitioning, KeySpace};
use crate::metrics::TimelineMetrics;
use crate::metrics::{
TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
};
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use crate::tenant::config::TenantConfOpt;
use pageserver_api::reltag::RelTag;
@@ -118,6 +120,7 @@ use utils::{
simple_rcu::{Rcu, RcuReadGuard},
};
use crate::page_cache;
use crate::repository::GcResult;
use crate::repository::{Key, Value};
use crate::task_mgr;
@@ -131,7 +134,7 @@ use self::layer_manager::LayerManager;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
use super::{config::TenantConf, storage_layer::VectoredValueReconstructState};
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
use super::{remote_timeline_client::RemoteTimelineClient, storage_layer::ReadableLayer};
@@ -884,11 +887,32 @@ impl Timeline {
self.timeline_get_throttle.throttle(ctx, 1).await;
// Check the page cache. We will get back the most recent page with lsn <= `lsn`.
// The cached image can be returned directly if there is no WAL between the cached image
// and requested LSN. The cached image can also be used to reduce the amount of WAL needed
// for redo.
let cached_page_img = match self.lookup_cached_page(&key, lsn, ctx).await {
Some((cached_lsn, cached_img)) => {
match cached_lsn.cmp(&lsn) {
Ordering::Less => {} // there might be WAL between cached_lsn and lsn, we need to check
Ordering::Equal => {
MATERIALIZED_PAGE_CACHE_HIT_DIRECT.inc();
return Ok(cached_img); // exact LSN match, return the image
}
Ordering::Greater => {
unreachable!("the returned lsn should never be after the requested lsn")
}
}
Some((cached_lsn, cached_img))
}
None => None,
};
match self.conf.get_impl {
GetImpl::Legacy => {
let reconstruct_state = ValueReconstructState {
records: Vec::new(),
img: None,
img: cached_page_img,
};
self.get_impl(key, lsn, reconstruct_state, ctx).await
@@ -902,6 +926,13 @@ impl Timeline {
// entry returned above.
let mut reconstruct_state = ValuesReconstructState::new();
// Only add the cached image to the reconstruct state when it exists.
if cached_page_img.is_some() {
let mut key_state = VectoredValueReconstructState::default();
key_state.img = cached_page_img;
reconstruct_state.keys.insert(key, Ok(key_state));
}
let vectored_res = self
.get_vectored_impl(keyspace.clone(), lsn, &mut reconstruct_state, ctx)
.await;
@@ -3209,6 +3240,7 @@ impl Timeline {
ValueReconstructResult::Continue => {
// If we reached an earlier cached page image, we're done.
if cont_lsn == cached_lsn + 1 {
MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
return Ok(traversal_path);
}
if let Some(prev) = prev_lsn {
@@ -3582,6 +3614,26 @@ impl Timeline {
})
}
/// # Cancel-safety
///
/// This method is cancellation-safe.
async fn lookup_cached_page(
&self,
key: &Key,
lsn: Lsn,
ctx: &RequestContext,
) -> Option<(Lsn, Bytes)> {
let cache = page_cache::get();
// FIXME: It's pointless to check the cache for things that are not 8kB pages.
// We should look at the key to determine if it's a cacheable object
let (lsn, read_guard) = cache
.lookup_materialized_page(self.tenant_shard_id, self.timeline_id, key, lsn, ctx)
.await?;
let img = Bytes::from(read_guard.to_vec());
Some((lsn, img))
}
async fn get_ready_ancestor_timeline(
&self,
ancestor: &Arc<Timeline>,
@@ -5228,6 +5280,8 @@ impl Timeline {
trace!("found {} WAL records that will init the page for {} at {}, performing WAL redo", data.records.len(), key, request_lsn);
};
let last_rec_lsn = data.records.last().unwrap().0;
let img = match self
.walredo_mgr
.as_ref()
@@ -5241,6 +5295,23 @@ impl Timeline {
Err(e) => return Err(PageReconstructError::WalRedo(e)),
};
if img.len() == page_cache::PAGE_SZ {
let cache = page_cache::get();
if let Err(e) = cache
.memorize_materialized_page(
self.tenant_shard_id,
self.timeline_id,
key,
last_rec_lsn,
&img,
)
.await
.context("Materialized page memoization failed")
{
return Err(PageReconstructError::from(e));
}
}
Ok(img)
}
}

View File

@@ -381,15 +381,6 @@ pageserver_connect(shardno_t shard_no, int elevel)
us_since_last_attempt = (int64) (now - shard->last_reconnect_time);
shard->last_reconnect_time = now;
/*
* Make sure we don't do exponential backoff with a constant multiplier
* of 0 us, as that doesn't really do much for timeouts...
*
* cf. https://github.com/neondatabase/neon/issues/7897
*/
if (shard->delay_us == 0)
shard->delay_us = MIN_RECONNECT_INTERVAL_USEC;
/*
* If we did other tasks between reconnect attempts, then we won't
* need to wait as long as a full delay.

View File

@@ -305,7 +305,7 @@ impl ProjectInfoCacheImpl {
// acquire a random shard lock
let mut removed = 0;
let shard = self.project2ep.shards()[shard].write();
for (_, endpoints) in shard.iter() {
for (_, endpoints) in crate::rawtable::iter(&*shard) {
for endpoint in endpoints.get().iter() {
self.cache.remove(endpoint);
removed += 1;

View File

@@ -517,11 +517,18 @@ impl<K: Hash + Eq + Clone> ApiLocks<K> {
);
let mut lock = shard.write();
let timer = self.metrics.reclamation_lag_seconds.start_timer();
let count = lock
.extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
.count();
let mut removed = 0;
crate::rawtable::retain(&mut *lock, |_, semaphore| {
let remove = Arc::strong_count(semaphore.get_mut()) == 1;
if remove {
removed += 1;
}
!remove
});
drop(lock);
self.metrics.semaphores_unregistered.inc_by(count as u64);
self.metrics.semaphores_unregistered.inc_by(removed as u64);
timer.observe();
}
}

View File

@@ -543,9 +543,7 @@ mod tests {
rx: impl Stream<Item = RequestData>,
) -> Vec<(u64, usize, i64)> {
let remote_storage_config = RemoteStorageConfig {
storage: RemoteStorageKind::LocalFs {
local_path: tmpdir.to_path_buf(),
},
storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()),
timeout: std::time::Duration::from_secs(120),
};
let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap();

View File

@@ -25,6 +25,7 @@ pub mod parse;
pub mod protocol2;
pub mod proxy;
pub mod rate_limiter;
mod rawtable;
pub mod redis;
pub mod sasl;
pub mod scram;

View File

@@ -91,7 +91,7 @@ pub async fn task_main(
let endpoint_rate_limiter2 = endpoint_rate_limiter.clone();
connections.spawn(async move {
let (socket, peer_addr) = match read_proxy_protocol(socket).await {
let (socket, peer_addr) = match read_proxy_protocol(socket).await{
Ok((socket, Some(addr))) => (socket, addr.ip()),
Err(e) => {
error!("per-client task finished with an error: {e:#}");
@@ -101,38 +101,36 @@ pub async fn task_main(
error!("missing required client IP");
return;
}
Ok((socket, None)) => (socket, peer_addr.ip()),
Ok((socket, None)) => (socket, peer_addr.ip())
};
match socket.inner.set_nodelay(true) {
Ok(()) => {}
Ok(()) => {},
Err(e) => {
error!("per-client task finished with an error: failed to set socket option: {e:#}");
return;
}
},
};
let mut ctx = RequestMonitoring::new(
session_id,
peer_addr,
crate::metrics::Protocol::Tcp,
&config.region,
);
session_id,
peer_addr,
crate::metrics::Protocol::Tcp,
&config.region,
);
let span = ctx.span.clone();
let startup = Box::pin(
handle_client(
config,
&mut ctx,
cancellation_handler,
socket,
ClientMode::Tcp,
endpoint_rate_limiter2,
conn_gauge,
)
.instrument(span.clone()),
);
let res = startup.await;
let res = handle_client(
config,
&mut ctx,
cancellation_handler,
socket,
ClientMode::Tcp,
endpoint_rate_limiter2,
conn_gauge,
)
.instrument(span.clone())
.await;
match res {
Err(e) => {

View File

@@ -98,7 +98,7 @@ pub(super) struct CopyBuffer {
amt: u64,
buf: Box<[u8]>,
}
const DEFAULT_BUF_SIZE: usize = 1024;
const DEFAULT_BUF_SIZE: usize = 8 * 1024;
impl CopyBuffer {
pub(super) fn new() -> Self {

61
proxy/src/rawtable.rs Normal file
View File

@@ -0,0 +1,61 @@
//! Dashmap moved to using RawTable for the shards.
//! Some of the APIs we used before are unsafe to access, but we can copy the implementations from the safe
//! HashMap wrappers for our needs.
// Safety info: All implementations here are taken directly from hashbrown HashMap impl.
use std::marker::PhantomData;
use hashbrown::raw;
// taken from https://docs.rs/hashbrown/0.14.5/src/hashbrown/map.rs.html#919-932
pub fn retain<K, V, F>(table: &mut raw::RawTable<(K, V)>, mut f: F)
where
F: FnMut(&K, &mut V) -> bool,
{
// SAFETY: Here we only use `iter` as a temporary, preventing use-after-free
unsafe {
for item in table.iter() {
let &mut (ref key, ref mut value) = item.as_mut();
if !f(key, value) {
table.erase(item);
}
}
}
}
// taken from https://docs.rs/hashbrown/0.14.5/src/hashbrown/map.rs.html#756-764
pub fn iter<K, V>(table: &raw::RawTable<(K, V)>) -> impl Iterator<Item = (&K, &V)> + '_ {
pub struct Iter<'a, K, V> {
inner: raw::RawIter<(K, V)>,
marker: PhantomData<(&'a K, &'a V)>,
}
impl<'a, K, V> Iterator for Iter<'a, K, V> {
type Item = (&'a K, &'a V);
#[cfg_attr(feature = "inline-more", inline)]
fn next(&mut self) -> Option<(&'a K, &'a V)> {
let x = self.inner.next()?;
// SAFETY: the borrows do not outlive the rawtable
unsafe {
let r = x.as_ref();
Some((&r.0, &r.1))
}
}
#[cfg_attr(feature = "inline-more", inline)]
fn size_hint(&self) -> (usize, Option<usize>) {
self.inner.size_hint()
}
}
// SAFETY:
// > It is up to the caller to ensure that the RawTable outlives the RawIter
// Here we tie the lifetime of self to the iter.
unsafe {
Iter {
inner: table.iter(),
marker: PhantomData,
}
}
}

View File

@@ -27,14 +27,14 @@ use rand::SeedableRng;
pub use reqwest_middleware::{ClientWithMiddleware, Error};
pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
use tokio::time::timeout;
use tokio_rustls::{server::TlsStream, TlsAcceptor};
use tokio_rustls::TlsAcceptor;
use tokio_util::task::TaskTracker;
use crate::cancellation::CancellationHandlerMain;
use crate::config::ProxyConfig;
use crate::context::RequestMonitoring;
use crate::metrics::Metrics;
use crate::protocol2::{read_proxy_protocol, ChainRW};
use crate::protocol2::read_proxy_protocol;
use crate::proxy::run_until_cancelled;
use crate::rate_limiter::EndpointRateLimiter;
use crate::serverless::backend::PoolingBackend;
@@ -102,6 +102,8 @@ pub async fn task_main(
let connections = tokio_util::task::task_tracker::TaskTracker::new();
connections.close(); // allows `connections.wait to complete`
let server = Builder::new(TokioExecutor::new());
while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await {
let (conn, peer_addr) = res.context("could not accept TCP stream")?;
if let Err(e) = conn.set_nodelay(true) {
@@ -125,50 +127,24 @@ pub async fn task_main(
}
let conn_token = cancellation_token.child_token();
let tls_acceptor = tls_acceptor.clone();
let backend = backend.clone();
let connections2 = connections.clone();
let cancellation_handler = cancellation_handler.clone();
let endpoint_rate_limiter = endpoint_rate_limiter.clone();
connections.spawn(
async move {
let conn_token2 = conn_token.clone();
let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token2);
let conn = connection_handler(
config,
backend.clone(),
connections.clone(),
cancellation_handler.clone(),
endpoint_rate_limiter.clone(),
conn_token.clone(),
server.clone(),
tls_acceptor.clone(),
conn,
peer_addr,
)
.instrument(http_conn_span);
let session_id = uuid::Uuid::new_v4();
let _gauge = Metrics::get()
.proxy
.client_connections
.guard(crate::metrics::Protocol::Http);
let startup_result = Box::pin(connection_startup(
config,
tls_acceptor,
session_id,
conn,
peer_addr,
))
.await;
let Some((conn, peer_addr)) = startup_result else {
return;
};
Box::pin(connection_handler(
config,
backend,
connections2,
cancellation_handler,
endpoint_rate_limiter,
conn_token,
conn,
peer_addr,
session_id,
))
.await;
}
.instrument(http_conn_span),
);
connections.spawn(async move {
let _cancel_guard = config.http_config.cancel_set.insert(conn_id, conn_token);
conn.await
});
}
connections.wait().await;
@@ -176,22 +152,40 @@ pub async fn task_main(
Ok(())
}
/// Handles the TCP startup lifecycle.
/// Handles the TCP lifecycle.
///
/// 1. Parses PROXY protocol V2
/// 2. Handles TLS handshake
async fn connection_startup(
config: &ProxyConfig,
/// 3. Handles HTTP connection
/// 1. With graceful shutdowns
/// 2. With graceful request cancellation with connection failure
/// 3. With websocket upgrade support.
#[allow(clippy::too_many_arguments)]
async fn connection_handler(
config: &'static ProxyConfig,
backend: Arc<PoolingBackend>,
connections: TaskTracker,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
cancellation_token: CancellationToken,
server: Builder<TokioExecutor>,
tls_acceptor: TlsAcceptor,
session_id: uuid::Uuid,
conn: TcpStream,
peer_addr: SocketAddr,
) -> Option<(TlsStream<ChainRW<TcpStream>>, IpAddr)> {
) {
let session_id = uuid::Uuid::new_v4();
let _gauge = Metrics::get()
.proxy
.client_connections
.guard(crate::metrics::Protocol::Http);
// handle PROXY protocol
let (conn, peer) = match read_proxy_protocol(conn).await {
Ok(c) => c,
Err(e) => {
tracing::error!(?session_id, %peer_addr, "failed to accept TCP connection: invalid PROXY protocol V2 header: {e:#}");
return None;
return;
}
};
@@ -214,7 +208,7 @@ async fn connection_startup(
Metrics::get().proxy.tls_handshake_failures.inc();
}
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
return None;
return;
}
// The handshake timed out
Err(e) => {
@@ -222,36 +216,16 @@ async fn connection_startup(
Metrics::get().proxy.tls_handshake_failures.inc();
}
warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}");
return None;
return;
}
};
Some((conn, peer_addr))
}
/// Handles HTTP connection
/// 1. With graceful shutdowns
/// 2. With graceful request cancellation with connection failure
/// 3. With websocket upgrade support.
#[allow(clippy::too_many_arguments)]
async fn connection_handler(
config: &'static ProxyConfig,
backend: Arc<PoolingBackend>,
connections: TaskTracker,
cancellation_handler: Arc<CancellationHandlerMain>,
endpoint_rate_limiter: Arc<EndpointRateLimiter>,
cancellation_token: CancellationToken,
conn: TlsStream<ChainRW<TcpStream>>,
peer_addr: IpAddr,
session_id: uuid::Uuid,
) {
let session_id = AtomicTake::new(session_id);
// Cancel all current inflight HTTP requests if the HTTP connection is closed.
let http_cancellation_token = CancellationToken::new();
let _cancel_connection = http_cancellation_token.clone().drop_guard();
let server = Builder::new(TokioExecutor::new());
let conn = server.serve_connection_with_upgrades(
hyper_util::rt::TokioIo::new(conn),
hyper1::service::service_fn(move |req: hyper1::Request<Incoming>| {

View File

@@ -104,7 +104,7 @@ impl PoolingBackend {
) -> Result<Client<tokio_postgres::Client>, HttpConnError> {
let maybe_client = if !force_new {
info!("pool: looking for an existing connection");
self.pool.get(ctx, &conn_info)?
self.pool.get(ctx, &conn_info).await?
} else {
info!("pool: pool is disabled");
None

View File

@@ -324,7 +324,8 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
.start_timer();
let current_len = shard.len();
let mut clients_removed = 0;
shard.retain(|endpoint, x| {
crate::rawtable::retain(&mut *shard, |endpoint, x| {
// if the current endpoint pool is unique (no other strong or weak references)
// then it is currently not in use by any connections.
if let Some(pool) = Arc::get_mut(x.get_mut()) {
@@ -375,7 +376,7 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
}
}
pub fn get(
pub async fn get(
self: &Arc<Self>,
ctx: &mut RequestMonitoring,
conn_info: &ConnInfo,

View File

@@ -533,31 +533,27 @@ async fn handle_inner(
return Err(SqlOverHttpError::RequestTooLarge);
}
let fetch_and_process_request = Box::pin(
async {
let body = request.into_body().collect().await?.to_bytes();
info!(length = body.len(), "request payload read");
let payload: Payload = serde_json::from_slice(&body)?;
Ok::<Payload, ReadPayloadError>(payload) // Adjust error type accordingly
}
.map_err(SqlOverHttpError::from),
);
let fetch_and_process_request = async {
let body = request.into_body().collect().await?.to_bytes();
info!(length = body.len(), "request payload read");
let payload: Payload = serde_json::from_slice(&body)?;
Ok::<Payload, ReadPayloadError>(payload) // Adjust error type accordingly
}
.map_err(SqlOverHttpError::from);
let authenticate_and_connect = Box::pin(
async {
let keys = backend
.authenticate(ctx, &config.authentication_config, &conn_info)
.await?;
let client = backend
.connect_to_compute(ctx, conn_info, keys, !allow_pool)
.await?;
// not strictly necessary to mark success here,
// but it's just insurance for if we forget it somewhere else
ctx.latency_timer.success();
Ok::<_, HttpConnError>(client)
}
.map_err(SqlOverHttpError::from),
);
let authenticate_and_connect = async {
let keys = backend
.authenticate(ctx, &config.authentication_config, &conn_info)
.await?;
let client = backend
.connect_to_compute(ctx, conn_info, keys, !allow_pool)
.await?;
// not strictly necessary to mark success here,
// but it's just insurance for if we forget it somewhere else
ctx.latency_timer.success();
Ok::<_, HttpConnError>(client)
}
.map_err(SqlOverHttpError::from);
let (payload, mut client) = match run_until_cancelled(
// Run both operations in parallel

View File

@@ -141,7 +141,7 @@ pub async fn serve_websocket(
.client_connections
.guard(crate::metrics::Protocol::Ws);
let res = Box::pin(handle_client(
let res = handle_client(
config,
&mut ctx,
cancellation_handler,
@@ -149,7 +149,7 @@ pub async fn serve_websocket(
ClientMode::Websockets { hostname },
endpoint_rate_limiter,
conn_gauge,
))
)
.await;
match res {

View File

@@ -12,16 +12,15 @@ use std::ops::Deref;
use std::path::Path;
use std::time::Instant;
use crate::control_file_upgrade::downgrade_v9_to_v8;
use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
use crate::state::{EvictionState, TimelinePersistentState};
use crate::state::TimelinePersistentState;
use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir};
use utils::{bin_ser::LeSer, id::TenantTimelineId};
use crate::SafeKeeperConf;
pub const SK_MAGIC: u32 = 0xcafeceefu32;
pub const SK_FORMAT_VERSION: u32 = 9;
pub const SK_FORMAT_VERSION: u32 = 8;
// contains persistent metadata for safekeeper
pub const CONTROL_FILE_NAME: &str = "safekeeper.control";
@@ -179,18 +178,8 @@ impl Storage for FileStorage {
})?;
let mut buf: Vec<u8> = Vec::new();
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_MAGIC)?;
if s.eviction_state == EvictionState::Present {
// temp hack for forward compatibility
const PREV_FORMAT_VERSION: u32 = 8;
let prev = downgrade_v9_to_v8(s);
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, PREV_FORMAT_VERSION)?;
prev.ser_into(&mut buf)?;
} else {
// otherwise, we write the current format version
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
s.ser_into(&mut buf)?;
}
WriteBytesExt::write_u32::<LittleEndian>(&mut buf, SK_FORMAT_VERSION)?;
s.ser_into(&mut buf)?;
// calculate checksum before resize
let checksum = crc32c::crc32c(&buf);

View File

@@ -1,7 +1,7 @@
//! Code to deal with safekeeper control file upgrades
use crate::{
safekeeper::{AcceptorState, PgUuid, ServerInfo, Term, TermHistory, TermLsn},
state::{EvictionState, PersistedPeers, TimelinePersistentState},
state::{PersistedPeers, TimelinePersistentState},
wal_backup_partial,
};
use anyhow::{bail, Result};
@@ -183,55 +183,6 @@ pub struct SafeKeeperStateV7 {
pub peers: PersistedPeers,
}
/// Persistent information stored on safekeeper node about timeline.
/// On disk data is prefixed by magic and format version and followed by checksum.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SafeKeeperStateV8 {
#[serde(with = "hex")]
pub tenant_id: TenantId,
#[serde(with = "hex")]
pub timeline_id: TimelineId,
/// persistent acceptor state
pub acceptor_state: AcceptorState,
/// information about server
pub server: ServerInfo,
/// Unique id of the last *elected* proposer we dealt with. Not needed
/// for correctness, exists for monitoring purposes.
#[serde(with = "hex")]
pub proposer_uuid: PgUuid,
/// Since which LSN this timeline generally starts. Safekeeper might have
/// joined later.
pub timeline_start_lsn: Lsn,
/// Since which LSN safekeeper has (had) WAL for this timeline.
/// All WAL segments next to one containing local_start_lsn are
/// filled with data from the beginning.
pub local_start_lsn: Lsn,
/// Part of WAL acknowledged by quorum *and available locally*. Always points
/// to record boundary.
pub commit_lsn: Lsn,
/// LSN that points to the end of the last backed up segment. Useful to
/// persist to avoid finding out offloading progress on boot.
pub backup_lsn: Lsn,
/// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn
/// of last record streamed to everyone). Persisting it helps skipping
/// recovery in walproposer, generally we compute it from peers. In
/// walproposer proto called 'truncate_lsn'. Updates are currently drived
/// only by walproposer.
pub peer_horizon_lsn: Lsn,
/// LSN of the oldest known checkpoint made by pageserver and successfully
/// pushed to s3. We don't remove WAL beyond it. Persisted only for
/// informational purposes, we receive it from pageserver (or broker).
pub remote_consistent_lsn: Lsn,
/// Peers and their state as we remember it. Knowing peers themselves is
/// fundamental; but state is saved here only for informational purposes and
/// obviously can be stale. (Currently not saved at all, but let's provision
/// place to have less file version upgrades).
pub peers: PersistedPeers,
/// Holds names of partial segments uploaded to remote storage. Used to
/// clean up old objects without leaving garbage in remote storage.
pub partial_backup: wal_backup_partial::State,
}
pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersistentState> {
// migrate to storing full term history
if version == 1 {
@@ -262,7 +213,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
remote_consistent_lsn: Lsn(0),
peers: PersistedPeers(vec![]),
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
});
// migrate to hexing some ids
} else if version == 2 {
@@ -287,7 +237,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
remote_consistent_lsn: Lsn(0),
peers: PersistedPeers(vec![]),
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
});
// migrate to moving tenant_id/timeline_id to the top and adding some lsns
} else if version == 3 {
@@ -312,7 +261,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
remote_consistent_lsn: Lsn(0),
peers: PersistedPeers(vec![]),
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
});
// migrate to having timeline_start_lsn
} else if version == 4 {
@@ -337,7 +285,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
remote_consistent_lsn: Lsn(0),
peers: PersistedPeers(vec![]),
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
});
} else if version == 5 {
info!("reading safekeeper control file version {}", version);
@@ -382,26 +329,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
remote_consistent_lsn: oldstate.remote_consistent_lsn,
peers: oldstate.peers,
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
});
} else if version == 8 {
let oldstate = SafeKeeperStateV8::des(&buf[..buf.len()])?;
return Ok(TimelinePersistentState {
tenant_id: oldstate.tenant_id,
timeline_id: oldstate.timeline_id,
acceptor_state: oldstate.acceptor_state,
server: oldstate.server,
proposer_uuid: oldstate.proposer_uuid,
timeline_start_lsn: oldstate.timeline_start_lsn,
local_start_lsn: oldstate.local_start_lsn,
commit_lsn: oldstate.commit_lsn,
backup_lsn: oldstate.backup_lsn,
peer_horizon_lsn: oldstate.peer_horizon_lsn,
remote_consistent_lsn: oldstate.remote_consistent_lsn,
peers: oldstate.peers,
partial_backup: oldstate.partial_backup,
eviction_state: EvictionState::Present,
});
}
@@ -411,25 +338,6 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<TimelinePersiste
bail!("unsupported safekeeper control file version {}", version)
}
pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 {
assert!(state.eviction_state == EvictionState::Present);
SafeKeeperStateV8 {
tenant_id: state.tenant_id,
timeline_id: state.timeline_id,
acceptor_state: state.acceptor_state.clone(),
server: state.server.clone(),
proposer_uuid: state.proposer_uuid,
timeline_start_lsn: state.timeline_start_lsn,
local_start_lsn: state.local_start_lsn,
commit_lsn: state.commit_lsn,
backup_lsn: state.backup_lsn,
peer_horizon_lsn: state.peer_horizon_lsn,
remote_consistent_lsn: state.remote_consistent_lsn,
peers: state.peers.clone(),
partial_backup: state.partial_backup.clone(),
}
}
#[cfg(test)]
mod tests {
use std::str::FromStr;

View File

@@ -958,7 +958,7 @@ mod tests {
use super::*;
use crate::{
state::{EvictionState, PersistedPeers, TimelinePersistentState},
state::{PersistedPeers, TimelinePersistentState},
wal_storage::Storage,
};
use std::{ops::Deref, str::FromStr, time::Instant};
@@ -1225,7 +1225,6 @@ mod tests {
},
)]),
partial_backup: crate::wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
};
let ser = state.ser().unwrap();
@@ -1273,8 +1272,6 @@ mod tests {
0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00,
// partial_backup
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
// eviction_state
0x00, 0x00, 0x00, 0x00,
];
assert_eq!(Hex(&ser), Hex(&expected));

View File

@@ -63,26 +63,11 @@ pub struct TimelinePersistentState {
/// Holds names of partial segments uploaded to remote storage. Used to
/// clean up old objects without leaving garbage in remote storage.
pub partial_backup: wal_backup_partial::State,
/// Eviction state of the timeline. If it's Offloaded, we should download
/// WAL files from remote storage to serve the timeline.
pub eviction_state: EvictionState,
}
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>);
/// State of the local WAL files. Used to track current timeline state,
/// that can be either WAL files are present on disk or last partial segment
/// is offloaded to remote storage.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub enum EvictionState {
/// WAL files are present on disk.
Present,
/// Last partial segment is offloaded to remote storage.
/// Contains flush_lsn of the last offloaded segment.
Offloaded(Lsn),
}
impl TimelinePersistentState {
pub fn new(
ttid: &TenantTimelineId,
@@ -113,7 +98,6 @@ impl TimelinePersistentState {
.collect(),
),
partial_backup: wal_backup_partial::State::default(),
eviction_state: EvictionState::Present,
}
}

View File

@@ -40,7 +40,6 @@ tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
measured.workspace = true
scopeguard.workspace = true
strum.workspace = true
strum_macros.workspace = true

View File

@@ -1,59 +0,0 @@
use std::{borrow::Cow, fmt::Debug, fmt::Display};
use tokio_util::sync::CancellationToken;
use utils::id::NodeId;
pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 10;
#[derive(Copy, Clone)]
pub(crate) struct Drain {
pub(crate) node_id: NodeId,
}
#[derive(Copy, Clone)]
pub(crate) struct Fill {
pub(crate) node_id: NodeId,
}
#[derive(Copy, Clone)]
pub(crate) enum Operation {
Drain(Drain),
Fill(Fill),
}
#[derive(Debug, thiserror::Error)]
pub(crate) enum OperationError {
#[error("Node state changed during operation: {0}")]
NodeStateChanged(Cow<'static, str>),
#[error("Operation finalize error: {0}")]
FinalizeError(Cow<'static, str>),
#[error("Operation cancelled")]
Cancelled,
}
pub(crate) struct OperationHandler {
pub(crate) operation: Operation,
#[allow(unused)]
pub(crate) cancel: CancellationToken,
}
impl Display for Drain {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "drain {}", self.node_id)
}
}
impl Display for Fill {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
write!(f, "fill {}", self.node_id)
}
}
impl Display for Operation {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Operation::Drain(op) => write!(f, "{op}"),
Operation::Fill(op) => write!(f, "{op}"),
}
}
}

View File

@@ -146,9 +146,6 @@ pub(crate) enum NotifyError {
// A response indicates we will never succeed, such as 400 or 404
#[error("Non-retryable error {0}")]
Fatal(StatusCode),
#[error("neon_local error: {0}")]
NeonLocal(anyhow::Error),
}
enum MaybeSendResult {
@@ -281,18 +278,12 @@ impl ComputeHook {
async fn do_notify_local(
&self,
reconfigure_request: &ComputeHookNotifyRequest,
) -> Result<(), NotifyError> {
) -> anyhow::Result<()> {
// neon_local updates are not safe to call concurrently, use a lock to serialize
// all calls to this function
let _locked = self.neon_local_lock.lock().await;
let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else {
tracing::warn!(
"neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update"
);
return Ok(());
};
let env = match LocalEnv::load_config(repo_dir) {
let env = match LocalEnv::load_config() {
Ok(e) => e,
Err(e) => {
tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})");
@@ -324,8 +315,7 @@ impl ComputeHook {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
endpoint
.reconfigure(compute_pageservers.clone(), *stripe_size)
.await
.map_err(NotifyError::NeonLocal)?;
.await?;
}
}
@@ -514,7 +504,7 @@ impl ComputeHook {
} else {
self.do_notify_local(&request).await.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("neon_local notification hook failed: {e}");
tracing::error!("Local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
};

View File

@@ -480,61 +480,6 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
)
}
async fn handle_node_status(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
let node_status = state.service.get_node(node_id).await?;
json_response(StatusCode::OK, node_status)
}
async fn handle_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
state.service.start_node_drain(node_id).await?;
json_response(StatusCode::ACCEPTED, ())
}
async fn handle_cancel_node_drain(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
state.service.cancel_node_drain(node_id).await?;
json_response(StatusCode::ACCEPTED, ())
}
async fn handle_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
state.service.start_node_fill(node_id).await?;
json_response(StatusCode::ACCEPTED, ())
}
async fn handle_cancel_node_fill(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let node_id: NodeId = parse_request_param(&req, "node_id")?;
state.service.cancel_node_fill(node_id).await?;
json_response(StatusCode::ACCEPTED, ())
}
async fn handle_tenant_shard_split(
service: Arc<Service>,
mut req: Request<Body>,
@@ -887,30 +832,6 @@ pub fn make_router(
RequestName("control_v1_node_config"),
)
})
.get("/control/v1/node/:node_id", |r| {
named_request_span(r, handle_node_status, RequestName("control_v1_node_status"))
})
.put("/control/v1/node/:node_id/drain", |r| {
named_request_span(r, handle_node_drain, RequestName("control_v1_node_drain"))
})
.delete("/control/v1/node/:node_id/drain", |r| {
named_request_span(
r,
handle_cancel_node_drain,
RequestName("control_v1_cancel_node_drain"),
)
})
.put("/control/v1/node/:node_id/fill", |r| {
named_request_span(r, handle_node_fill, RequestName("control_v1_node_fill"))
})
.delete("/control/v1/node/:node_id/fill", |r| {
named_request_span(
r,
handle_cancel_node_fill,
RequestName("control_v1_cancel_node_fill"),
)
})
// TODO(vlad): endpoint for cancelling drain and fill
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(

View File

@@ -8,15 +8,14 @@ use crate::service::RECONCILE_TIMEOUT;
const LOCK_TIMEOUT_ALERT_THRESHOLD: Duration = RECONCILE_TIMEOUT;
/// A wrapper around `OwnedRwLockWriteGuard` used for tracking the
/// operation that holds the lock, and print a warning if it exceeds
/// the LOCK_TIMEOUT_ALERT_THRESHOLD time
pub struct TracingExclusiveGuard<T: Display> {
/// A wrapper around `OwnedRwLockWriteGuard` that when dropped changes the
/// current holding operation in lock.
pub struct WrappedWriteGuard<T: Display> {
guard: tokio::sync::OwnedRwLockWriteGuard<Option<T>>,
start: Instant,
}
impl<T: Display> TracingExclusiveGuard<T> {
impl<T: Display> WrappedWriteGuard<T> {
pub fn new(guard: tokio::sync::OwnedRwLockWriteGuard<Option<T>>) -> Self {
Self {
guard,
@@ -25,12 +24,12 @@ impl<T: Display> TracingExclusiveGuard<T> {
}
}
impl<T: Display> Drop for TracingExclusiveGuard<T> {
impl<T: Display> Drop for WrappedWriteGuard<T> {
fn drop(&mut self) {
let duration = self.start.elapsed();
if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
tracing::warn!(
"Exclusive lock by {} was held for {:?}",
"Lock on {} was held for {:?}",
self.guard.as_ref().unwrap(),
duration
);
@@ -39,38 +38,6 @@ impl<T: Display> Drop for TracingExclusiveGuard<T> {
}
}
// A wrapper around `OwnedRwLockReadGuard` used for tracking the
/// operation that holds the lock, and print a warning if it exceeds
/// the LOCK_TIMEOUT_ALERT_THRESHOLD time
pub struct TracingSharedGuard<T: Display> {
_guard: tokio::sync::OwnedRwLockReadGuard<Option<T>>,
operation: T,
start: Instant,
}
impl<T: Display> TracingSharedGuard<T> {
pub fn new(guard: tokio::sync::OwnedRwLockReadGuard<Option<T>>, operation: T) -> Self {
Self {
_guard: guard,
operation,
start: Instant::now(),
}
}
}
impl<T: Display> Drop for TracingSharedGuard<T> {
fn drop(&mut self) {
let duration = self.start.elapsed();
if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
tracing::warn!(
"Shared lock by {} was held for {:?}",
self.operation,
duration
);
}
}
}
/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
@@ -91,22 +58,21 @@ where
pub(crate) fn shared(
&self,
key: T,
operation: I,
) -> impl std::future::Future<Output = TracingSharedGuard<I>> {
) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<Option<I>>> {
let mut locked = self.entities.lock().unwrap();
let entry = locked.entry(key).or_default().clone();
async move { TracingSharedGuard::new(entry.read_owned().await, operation) }
let entry = locked.entry(key).or_default();
entry.clone().read_owned()
}
pub(crate) fn exclusive(
&self,
key: T,
operation: I,
) -> impl std::future::Future<Output = TracingExclusiveGuard<I>> {
) -> impl std::future::Future<Output = WrappedWriteGuard<I>> {
let mut locked = self.entities.lock().unwrap();
let entry = locked.entry(key).or_default().clone();
async move {
let mut guard = TracingExclusiveGuard::new(entry.write_owned().await);
let mut guard = WrappedWriteGuard::new(entry.clone().write_owned().await);
*guard.guard = Some(operation);
guard
}
@@ -133,12 +99,12 @@ where
pub async fn trace_exclusive_lock<
T: Clone + Display + Eq + PartialEq + std::hash::Hash,
I: Clone + Display,
I: Display + Clone,
>(
op_locks: &IdLockMap<T, I>,
key: T,
operation: I,
) -> TracingExclusiveGuard<I> {
) -> WrappedWriteGuard<I> {
let start = Instant::now();
let guard = op_locks.exclusive(key.clone(), operation.clone()).await;
@@ -157,14 +123,14 @@ pub async fn trace_exclusive_lock<
pub async fn trace_shared_lock<
T: Clone + Display + Eq + PartialEq + std::hash::Hash,
I: Clone + Display,
I: Display,
>(
op_locks: &IdLockMap<T, I>,
key: T,
operation: I,
) -> TracingSharedGuard<I> {
) -> tokio::sync::OwnedRwLockReadGuard<Option<I>> {
let start = Instant::now();
let guard = op_locks.shared(key.clone(), operation.clone()).await;
let guard = op_locks.shared(key.clone()).await;
let duration = start.elapsed();
if duration > LOCK_TIMEOUT_ALERT_THRESHOLD {
@@ -193,11 +159,11 @@ mod tests {
async fn multiple_shared_locks() {
let id_lock_map: IdLockMap<i32, Operations> = IdLockMap::default();
let shared_lock_1 = id_lock_map.shared(1, Operations::Op1).await;
let shared_lock_2 = id_lock_map.shared(1, Operations::Op2).await;
let shared_lock_1 = id_lock_map.shared(1).await;
let shared_lock_2 = id_lock_map.shared(1).await;
assert_eq!(shared_lock_1.operation, Operations::Op1);
assert_eq!(shared_lock_2.operation, Operations::Op2);
assert!(shared_lock_1.is_none());
assert!(shared_lock_2.is_none());
}
#[tokio::test]
@@ -217,7 +183,7 @@ mod tests {
assert!(_ex_lock_2.is_err());
}
let shared_lock_1 = id_lock_map.shared(resource_id, Operations::Op1).await;
assert_eq!(shared_lock_1.operation, Operations::Op1);
let shared_lock_1 = id_lock_map.shared(resource_id).await;
assert!(shared_lock_1.is_none());
}
}

View File

@@ -2,7 +2,6 @@ use serde::Serialize;
use utils::seqwait::MonotonicCounter;
mod auth;
mod background_node_operations;
mod compute_hook;
mod heartbeater;
pub mod http;

View File

@@ -4,7 +4,6 @@ use clap::Parser;
use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use metrics::BuildInfo;
use std::path::PathBuf;
use std::sync::Arc;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
@@ -78,12 +77,6 @@ struct Cli {
/// How long to wait for the initial database connection to be available.
#[arg(long, default_value = "5s")]
db_connect_timeout: humantime::Duration,
/// `neon_local` sets this to the path of the neon_local repo dir.
/// Only relevant for testing.
// TODO: make `cfg(feature = "testing")`
#[arg(long)]
neon_local_repo_dir: Option<PathBuf>,
}
enum StrictMode {
@@ -267,7 +260,6 @@ async fn async_main() -> anyhow::Result<()> {
.reconciler_concurrency
.unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
split_threshold: args.split_threshold,
neon_local_repo_dir: args.neon_local_repo_dir,
};
// After loading secrets & config, but before starting anything else, apply database migrations

View File

@@ -59,10 +59,6 @@ impl Node {
self.id
}
pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy {
self.scheduling
}
pub(crate) fn set_scheduling(&mut self, scheduling: NodeSchedulingPolicy) {
self.scheduling = scheduling
}
@@ -155,7 +151,6 @@ impl Node {
NodeSchedulingPolicy::Draining => MaySchedule::No,
NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
NodeSchedulingPolicy::Pause => MaySchedule::No,
NodeSchedulingPolicy::PauseForRestart => MaySchedule::No,
}
}
@@ -172,7 +167,7 @@ impl Node {
listen_http_port,
listen_pg_addr,
listen_pg_port,
scheduling: NodeSchedulingPolicy::Active,
scheduling: NodeSchedulingPolicy::Filling,
availability: NodeAvailability::Offline,
cancel: CancellationToken::new(),
}

View File

@@ -442,15 +442,13 @@ impl Persistence {
#[tracing::instrument(skip_all, fields(node_id))]
pub(crate) async fn re_attach(
&self,
input_node_id: NodeId,
node_id: NodeId,
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
use crate::schema::nodes::dsl::scheduling_policy;
use crate::schema::nodes::dsl::*;
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
let rows_updated = diesel::update(tenant_shards)
.filter(generation_pageserver.eq(input_node_id.0 as i64))
.filter(generation_pageserver.eq(node_id.0 as i64))
.set(generation.eq(generation + 1))
.execute(conn)?;
@@ -459,23 +457,9 @@ impl Persistence {
// TODO: UPDATE+SELECT in one query
let updated = tenant_shards
.filter(generation_pageserver.eq(input_node_id.0 as i64))
.filter(generation_pageserver.eq(node_id.0 as i64))
.select(TenantShardPersistence::as_select())
.load(conn)?;
// If the node went through a drain and restart phase before re-attaching,
// then reset it's node scheduling policy to active.
diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.filter(
scheduling_policy
.eq(String::from(NodeSchedulingPolicy::PauseForRestart))
.or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Draining)))
.or(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Filling))),
)
.set(scheduling_policy.eq(String::from(NodeSchedulingPolicy::Active)))
.execute(conn)?;
Ok(updated)
})
.await?;

View File

@@ -1,5 +1,4 @@
use crate::{node::Node, tenant_shard::TenantShard};
use itertools::Itertools;
use pageserver_api::controller_api::UtilizationScore;
use serde::Serialize;
use std::collections::HashMap;
@@ -284,44 +283,6 @@ impl Scheduler {
}
}
// Check if the number of shards attached to a given node is lagging below
// the cluster average. If that's the case, the node should be filled.
pub(crate) fn compute_fill_requirement(&self, node_id: NodeId) -> usize {
let Some(node) = self.nodes.get(&node_id) else {
debug_assert!(false);
tracing::error!("Scheduler missing node {node_id}");
return 0;
};
assert!(!self.nodes.is_empty());
let expected_attached_shards_per_node = self.expected_attached_shard_count();
for (node_id, node) in self.nodes.iter() {
tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node);
}
if node.attached_shard_count < expected_attached_shards_per_node {
expected_attached_shards_per_node - node.attached_shard_count
} else {
0
}
}
pub(crate) fn expected_attached_shard_count(&self) -> usize {
let total_attached_shards: usize =
self.nodes.values().map(|n| n.attached_shard_count).sum();
assert!(!self.nodes.is_empty());
total_attached_shards / self.nodes.len()
}
pub(crate) fn nodes_by_attached_shard_count(&self) -> Vec<(NodeId, usize)> {
self.nodes
.iter()
.map(|(node_id, stats)| (*node_id, stats.attached_shard_count))
.sorted_by(|lhs, rhs| Ord::cmp(&lhs.1, &rhs.1).reverse())
.collect()
}
pub(crate) fn node_upsert(&mut self, node: &Node) {
use std::collections::hash_map::Entry::*;
match self.nodes.entry(node.get_id()) {
@@ -391,7 +352,7 @@ impl Scheduler {
return Err(ScheduleError::NoPageservers);
}
let mut scores: Vec<(NodeId, AffinityScore, usize, usize)> = self
let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
.nodes
.iter()
.filter_map(|(k, v)| {
@@ -402,7 +363,6 @@ impl Scheduler {
*k,
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
v.shard_count,
v.attached_shard_count,
))
}
})
@@ -410,12 +370,9 @@ impl Scheduler {
// Sort by, in order of precedence:
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
// 2nd: Attached shard count. Within nodes with the same affinity, we always pick the node with
// the least number of attached shards.
// 3rd: Total shard count. Within nodes with the same affinity and attached shard count, use nodes
// with the lower total shard count.
// 4th: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
scores.sort_by_key(|i| (i.1, i.3, i.2, i.0));
// 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
// 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
scores.sort_by_key(|i| (i.1, i.2, i.0));
if scores.is_empty() {
// After applying constraints, no pageservers were left.

View File

@@ -2,24 +2,19 @@ use std::{
borrow::Cow,
cmp::Ordering,
collections::{BTreeMap, HashMap, HashSet},
path::PathBuf,
str::FromStr,
sync::Arc,
time::{Duration, Instant},
};
use crate::{
background_node_operations::{
Drain, Fill, Operation, OperationError, OperationHandler, MAX_RECONCILES_PER_OPERATION,
},
compute_hook::NotifyError,
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, TracingExclusiveGuard},
id_lock_map::{trace_exclusive_lock, trace_shared_lock, IdLockMap, WrappedWriteGuard},
persistence::{AbortShardSplitStatus, TenantFilter},
reconciler::{ReconcileError, ReconcileUnits},
scheduler::{MaySchedule, ScheduleContext, ScheduleMode},
tenant_shard::{
MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization,
ScheduleOptimizationAction,
MigrateAttachment, ReconcileNeeded, ScheduleOptimization, ScheduleOptimizationAction,
},
};
use anyhow::Context;
@@ -139,11 +134,6 @@ struct ServiceState {
scheduler: Scheduler,
/// Ongoing background operation on the cluster if any is running.
/// Note that only one such operation may run at any given time,
/// hence the type choice.
ongoing_operation: Option<OperationHandler>,
/// Queue of tenants who are waiting for concurrency limits to permit them to reconcile
delayed_reconcile_rx: tokio::sync::mpsc::Receiver<TenantShardId>,
}
@@ -195,7 +185,6 @@ impl ServiceState {
tenants,
nodes: Arc::new(nodes),
scheduler,
ongoing_operation: None,
delayed_reconcile_rx,
}
}
@@ -237,9 +226,6 @@ pub struct Config {
/// How large must a shard grow in bytes before we split it?
/// None disables auto-splitting.
pub split_threshold: Option<u64>,
// TODO: make this cfg(feature = "testing")
pub neon_local_repo_dir: Option<PathBuf>,
}
impl From<DatabaseError> for ApiError {
@@ -310,17 +296,6 @@ impl From<ReconcileWaitError> for ApiError {
}
}
impl From<OperationError> for ApiError {
fn from(value: OperationError) -> Self {
match value {
OperationError::NodeStateChanged(err) | OperationError::FinalizeError(err) => {
ApiError::InternalServerError(anyhow::anyhow!(err))
}
OperationError::Cancelled => ApiError::Conflict("Operation was cancelled".into()),
}
}
}
#[allow(clippy::large_enum_variant)]
enum TenantCreateOrUpdate {
Create(TenantCreateRequest),
@@ -359,7 +334,7 @@ struct TenantShardSplitAbort {
new_shard_count: ShardCount,
new_stripe_size: Option<ShardStripeSize>,
/// Until this abort op is complete, no other operations may be done on the tenant
_tenant_lock: TracingExclusiveGuard<TenantOperations>,
_tenant_lock: WrappedWriteGuard<TenantOperations>,
}
#[derive(thiserror::Error, Debug)]
@@ -1238,14 +1213,13 @@ impl Service {
let locked = self.inner.write().unwrap();
!locked.tenants.contains_key(&attach_req.tenant_shard_id)
};
if insert {
let tsp = TenantShardPersistence {
tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
shard_stripe_size: 0,
generation: attach_req.generation_override.or(Some(0)),
generation: Some(0),
generation_pageserver: None,
placement_policy: serde_json::to_string(&PlacementPolicy::Attached(0)).unwrap(),
config: serde_json::to_string(&TenantConfig::default()).unwrap(),
@@ -1429,7 +1403,7 @@ impl Service {
async fn node_activate_reconcile(
&self,
mut node: Node,
_lock: &TracingExclusiveGuard<NodeOperations>,
_lock: &WrappedWriteGuard<NodeOperations>,
) -> Result<(), ApiError> {
// This Node is a mutable local copy: we will set it active so that we can use its
// API client to reconcile with the node. The Node in [`Self::nodes`] will get updated
@@ -1620,32 +1594,15 @@ impl Service {
// Setting a node active unblocks any Reconcilers that might write to the location config API,
// but those requests will not be accepted by the node until it has finished processing
// the re-attach response.
//
// Additionally, reset the nodes scheduling policy to match the conditional update done
// in [`Persistence::re_attach`].
if let Some(node) = nodes.get(&reattach_req.node_id) {
let reset_scheduling = matches!(
node.get_scheduling(),
NodeSchedulingPolicy::PauseForRestart
| NodeSchedulingPolicy::Draining
| NodeSchedulingPolicy::Filling
);
if !node.is_available() || reset_scheduling {
if !node.is_available() {
let mut new_nodes = (**nodes).clone();
if let Some(node) = new_nodes.get_mut(&reattach_req.node_id) {
if !node.is_available() {
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
}
if reset_scheduling {
node.set_scheduling(NodeSchedulingPolicy::Active);
}
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
scheduler.node_upsert(node);
let new_nodes = Arc::new(new_nodes);
*nodes = new_nodes;
}
let new_nodes = Arc::new(new_nodes);
*nodes = new_nodes;
}
}
@@ -1926,25 +1883,6 @@ impl Service {
Ok(())
}
/// Same as [`Service::await_waiters`], but returns the waiters which are still
/// in progress
async fn await_waiters_remainder(
&self,
waiters: Vec<ReconcilerWaiter>,
timeout: Duration,
) -> Vec<ReconcilerWaiter> {
let deadline = Instant::now().checked_add(timeout).unwrap();
for waiter in waiters.iter() {
let timeout = deadline.duration_since(Instant::now());
let _ = waiter.wait_timeout(timeout).await;
}
waiters
.into_iter()
.filter(|waiter| matches!(waiter.get_status(), ReconcilerStatus::InProgress))
.collect::<Vec<_>>()
}
/// Part of [`Self::tenant_location_config`]: dissect an incoming location config request,
/// and transform it into either a tenant creation of a series of shard updates.
///
@@ -2658,7 +2596,6 @@ impl Service {
TenantOperations::TimelineCreate,
)
.await;
failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock");
self.ensure_attached_wait(tenant_id).await?;
@@ -4227,18 +4164,6 @@ impl Service {
Ok(nodes)
}
pub(crate) async fn get_node(&self, node_id: NodeId) -> Result<Node, ApiError> {
self.inner
.read()
.unwrap()
.nodes
.get(&node_id)
.cloned()
.ok_or(ApiError::NotFound(
format!("Node {node_id} not registered").into(),
))
}
pub(crate) async fn node_register(
&self,
register_req: NodeRegisterRequest,
@@ -4393,6 +4318,9 @@ impl Service {
if let Some(scheduling) = scheduling {
node.set_scheduling(scheduling);
// TODO: once we have a background scheduling ticker for fill/drain, kick it
// to wake up and start working.
}
// Update the scheduler, in case the elegibility of the node for new shards has changed
@@ -4483,7 +4411,7 @@ impl Service {
// TODO: in the background, we should balance work back onto this pageserver
}
AvailabilityTransition::Unchanged => {
tracing::debug!("Node {} no availability change during config", node_id);
tracing::debug!("Node {} no change during config", node_id);
}
}
@@ -4492,283 +4420,6 @@ impl Service {
Ok(())
}
pub(crate) async fn start_node_drain(
self: &Arc<Self>,
node_id: NodeId,
) -> Result<(), ApiError> {
let (ongoing_op, node_available, node_policy, schedulable_nodes_count) = {
let locked = self.inner.read().unwrap();
let nodes = &locked.nodes;
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
anyhow::anyhow!("Node {} not registered", node_id).into(),
))?;
let schedulable_nodes_count = nodes
.iter()
.filter(|(_, n)| matches!(n.may_schedule(), MaySchedule::Yes(_)))
.count();
(
locked
.ongoing_operation
.as_ref()
.map(|ongoing| ongoing.operation),
node.is_available(),
node.get_scheduling(),
schedulable_nodes_count,
)
};
if let Some(ongoing) = ongoing_op {
return Err(ApiError::PreconditionFailed(
format!("Background operation already ongoing for node: {}", ongoing).into(),
));
}
if !node_available {
return Err(ApiError::ResourceUnavailable(
format!("Node {node_id} is currently unavailable").into(),
));
}
if schedulable_nodes_count == 0 {
return Err(ApiError::PreconditionFailed(
"No other schedulable nodes to drain to".into(),
));
}
match node_policy {
NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => {
self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining))
.await?;
let cancel = self.cancel.child_token();
let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
operation: Operation::Drain(Drain { node_id }),
cancel: cancel.clone(),
});
tokio::task::spawn({
let service = self.clone();
let cancel = cancel.clone();
async move {
let _gate_guard = gate_guard;
scopeguard::defer! {
let prev = service.inner.write().unwrap().ongoing_operation.take();
if let Some(Operation::Drain(removed_drain)) = prev.map(|h| h.operation) {
assert_eq!(removed_drain.node_id, node_id, "We always take the same operation");
} else {
panic!("We always remove the same operation")
}
}
tracing::info!(%node_id, "Drain background operation starting");
let res = service.drain_node(node_id, cancel).await;
match res {
Ok(()) => {
tracing::info!(%node_id, "Drain background operation completed successfully");
}
Err(OperationError::Cancelled) => {
tracing::info!(%node_id, "Drain background operation was cancelled");
}
Err(err) => {
tracing::error!(%node_id, "Drain background operation encountered: {err}")
}
}
}
});
}
NodeSchedulingPolicy::Draining => {
return Err(ApiError::Conflict(format!(
"Node {node_id} has drain in progress"
)));
}
policy => {
return Err(ApiError::PreconditionFailed(
format!("Node {node_id} cannot be drained due to {policy:?} policy").into(),
));
}
}
Ok(())
}
pub(crate) async fn cancel_node_drain(&self, node_id: NodeId) -> Result<(), ApiError> {
let (node_available, node_policy) = {
let locked = self.inner.read().unwrap();
let nodes = &locked.nodes;
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
anyhow::anyhow!("Node {} not registered", node_id).into(),
))?;
(node.is_available(), node.get_scheduling())
};
if !node_available {
return Err(ApiError::ResourceUnavailable(
format!("Node {node_id} is currently unavailable").into(),
));
}
if !matches!(node_policy, NodeSchedulingPolicy::Draining) {
return Err(ApiError::PreconditionFailed(
format!("Node {node_id} has no drain in progress").into(),
));
}
if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
if let Operation::Drain(drain) = op_handler.operation {
if drain.node_id == node_id {
tracing::info!("Cancelling background drain operation for node {node_id}");
op_handler.cancel.cancel();
return Ok(());
}
}
}
Err(ApiError::PreconditionFailed(
format!("Node {node_id} has no drain in progress").into(),
))
}
pub(crate) async fn start_node_fill(self: &Arc<Self>, node_id: NodeId) -> Result<(), ApiError> {
let (ongoing_op, node_available, node_policy, total_nodes_count) = {
let locked = self.inner.read().unwrap();
let nodes = &locked.nodes;
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
anyhow::anyhow!("Node {} not registered", node_id).into(),
))?;
(
locked
.ongoing_operation
.as_ref()
.map(|ongoing| ongoing.operation),
node.is_available(),
node.get_scheduling(),
nodes.len(),
)
};
if let Some(ongoing) = ongoing_op {
return Err(ApiError::PreconditionFailed(
format!("Background operation already ongoing for node: {}", ongoing).into(),
));
}
if !node_available {
return Err(ApiError::ResourceUnavailable(
format!("Node {node_id} is currently unavailable").into(),
));
}
if total_nodes_count <= 1 {
return Err(ApiError::PreconditionFailed(
"No other nodes to fill from".into(),
));
}
match node_policy {
NodeSchedulingPolicy::Active => {
self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Filling))
.await?;
let cancel = self.cancel.child_token();
let gate_guard = self.gate.enter().map_err(|_| ApiError::ShuttingDown)?;
self.inner.write().unwrap().ongoing_operation = Some(OperationHandler {
operation: Operation::Fill(Fill { node_id }),
cancel: cancel.clone(),
});
tokio::task::spawn({
let service = self.clone();
let cancel = cancel.clone();
async move {
let _gate_guard = gate_guard;
scopeguard::defer! {
let prev = service.inner.write().unwrap().ongoing_operation.take();
if let Some(Operation::Fill(removed_fill)) = prev.map(|h| h.operation) {
assert_eq!(removed_fill.node_id, node_id, "We always take the same operation");
} else {
panic!("We always remove the same operation")
}
}
tracing::info!(%node_id, "Fill background operation starting");
let res = service.fill_node(node_id, cancel).await;
match res {
Ok(()) => {
tracing::info!(%node_id, "Fill background operation completed successfully");
}
Err(OperationError::Cancelled) => {
tracing::info!(%node_id, "Fill background operation was cancelled");
}
Err(err) => {
tracing::error!(%node_id, "Fill background operation encountered: {err}")
}
}
}
});
}
NodeSchedulingPolicy::Filling => {
return Err(ApiError::Conflict(format!(
"Node {node_id} has fill in progress"
)));
}
policy => {
return Err(ApiError::PreconditionFailed(
format!("Node {node_id} cannot be filled due to {policy:?} policy").into(),
));
}
}
Ok(())
}
pub(crate) async fn cancel_node_fill(&self, node_id: NodeId) -> Result<(), ApiError> {
let (node_available, node_policy) = {
let locked = self.inner.read().unwrap();
let nodes = &locked.nodes;
let node = nodes.get(&node_id).ok_or(ApiError::NotFound(
anyhow::anyhow!("Node {} not registered", node_id).into(),
))?;
(node.is_available(), node.get_scheduling())
};
if !node_available {
return Err(ApiError::ResourceUnavailable(
format!("Node {node_id} is currently unavailable").into(),
));
}
if !matches!(node_policy, NodeSchedulingPolicy::Filling) {
return Err(ApiError::PreconditionFailed(
format!("Node {node_id} has no fill in progress").into(),
));
}
if let Some(op_handler) = self.inner.read().unwrap().ongoing_operation.as_ref() {
if let Operation::Fill(fill) = op_handler.operation {
if fill.node_id == node_id {
tracing::info!("Cancelling background drain operation for node {node_id}");
op_handler.cancel.cancel();
return Ok(());
}
}
}
Err(ApiError::PreconditionFailed(
format!("Node {node_id} has no fill in progress").into(),
))
}
/// Helper for methods that will try and call pageserver APIs for
/// a tenant, such as timeline CRUD: they cannot proceed unless the tenant
/// is attached somewhere.
@@ -5353,383 +5004,4 @@ impl Service {
// to complete.
self.gate.close().await;
}
/// Drain a node by moving the shards attached to it as primaries.
/// This is a long running operation and it should run as a separate Tokio task.
pub(crate) async fn drain_node(
&self,
node_id: NodeId,
cancel: CancellationToken,
) -> Result<(), OperationError> {
let mut last_inspected_shard: Option<TenantShardId> = None;
let mut inspected_all_shards = false;
let mut waiters = Vec::new();
while !inspected_all_shards {
if cancel.is_cancelled() {
match self
.node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
.await
{
Ok(()) => return Err(OperationError::Cancelled),
Err(err) => {
return Err(OperationError::FinalizeError(
format!(
"Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
node_id, err
)
.into(),
));
}
}
}
{
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged(
format!("node {node_id} was removed").into(),
))?;
let current_policy = node.get_scheduling();
if !matches!(current_policy, NodeSchedulingPolicy::Draining) {
// TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
// about it
return Err(OperationError::NodeStateChanged(
format!("node {node_id} changed state to {current_policy:?}").into(),
));
}
let mut cursor = tenants.iter_mut().skip_while({
let skip_past = last_inspected_shard;
move |(tid, _)| match skip_past {
Some(last) => **tid != last,
None => false,
}
});
while waiters.len() < MAX_RECONCILES_PER_OPERATION {
let (tid, tenant_shard) = match cursor.next() {
Some(some) => some,
None => {
inspected_all_shards = true;
break;
}
};
// If the shard is not attached to the node being drained, skip it.
if *tenant_shard.intent.get_attached() != Some(node_id) {
last_inspected_shard = Some(*tid);
continue;
}
match tenant_shard.reschedule_to_secondary(None, scheduler) {
Err(e) => {
tracing::warn!(
tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
"Scheduling error when draining pageserver {} : {e}", node_id
);
}
Ok(()) => {
let scheduled_to = tenant_shard.intent.get_attached();
tracing::info!(
tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
"Rescheduled shard while draining node {}: {} -> {:?}",
node_id,
node_id,
scheduled_to
);
let waiter = self.maybe_reconcile_shard(tenant_shard, nodes);
if let Some(some) = waiter {
waiters.push(some);
}
}
}
last_inspected_shard = Some(*tid);
}
}
waiters = self
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
.await;
failpoint_support::sleep_millis_async!("sleepy-drain-loop");
}
while !waiters.is_empty() {
if cancel.is_cancelled() {
match self
.node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
.await
{
Ok(()) => return Err(OperationError::Cancelled),
Err(err) => {
return Err(OperationError::FinalizeError(
format!(
"Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
node_id, err
)
.into(),
));
}
}
}
tracing::info!("Awaiting {} pending drain reconciliations", waiters.len());
waiters = self
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
.await;
}
// At this point we have done the best we could to drain shards from this node.
// Set the node scheduling policy to `[NodeSchedulingPolicy::PauseForRestart]`
// to complete the drain.
if let Err(err) = self
.node_configure(node_id, None, Some(NodeSchedulingPolicy::PauseForRestart))
.await
{
// This is not fatal. Anything that is polling the node scheduling policy to detect
// the end of the drain operations will hang, but all such places should enforce an
// overall timeout. The scheduling policy will be updated upon node re-attach and/or
// by the counterpart fill operation.
return Err(OperationError::FinalizeError(
format!(
"Failed to finalise drain of {node_id} by setting scheduling policy to PauseForRestart: {err}"
)
.into(),
));
}
Ok(())
}
/// Create a node fill plan (pick secondaries to promote) that meets the following requirements:
/// 1. The node should be filled until it reaches the expected cluster average of
/// attached shards. If there are not enough secondaries on the node, the plan stops early.
/// 2. Select tenant shards to promote such that the number of attached shards is balanced
/// throughout the cluster. We achieve this by picking tenant shards from each node,
/// starting from the ones with the largest number of attached shards, until the node
/// reaches the expected cluster average.
/// 3. Avoid promoting more shards of the same tenant than required. The upper bound
/// for the number of tenants from the same shard promoted to the node being filled is:
/// shard count for the tenant divided by the number of nodes in the cluster.
fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
let mut locked = self.inner.write().unwrap();
let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
let mut tids_by_node = locked
.tenants
.iter_mut()
.filter_map(|(tid, tenant_shard)| {
if tenant_shard.intent.get_secondary().contains(&node_id) {
if let Some(primary) = tenant_shard.intent.get_attached() {
return Some((*primary, *tid));
}
}
None
})
.into_group_map();
let expected_attached = locked.scheduler.expected_attached_shard_count();
let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
let mut plan = Vec::new();
for (node_id, attached) in nodes_by_load {
let available = locked
.nodes
.get(&node_id)
.map_or(false, |n| n.is_available());
if !available {
continue;
}
if plan.len() >= fill_requirement
|| tids_by_node.is_empty()
|| attached <= expected_attached
{
break;
}
let mut can_take = attached - expected_attached;
let mut remove_node = false;
while can_take > 0 {
match tids_by_node.get_mut(&node_id) {
Some(tids) => match tids.pop() {
Some(tid) => {
let max_promote_for_tenant = std::cmp::max(
tid.shard_count.count() as usize / locked.nodes.len(),
1,
);
let promoted = promoted_per_tenant.entry(tid.tenant_id).or_default();
if *promoted < max_promote_for_tenant {
plan.push(tid);
*promoted += 1;
can_take -= 1;
}
}
None => {
remove_node = true;
break;
}
},
None => {
break;
}
}
}
if remove_node {
tids_by_node.remove(&node_id);
}
}
plan
}
/// Fill a node by promoting its secondaries until the cluster is balanced
/// with regards to attached shard counts. Note that this operation only
/// makes sense as a counterpart to the drain implemented in [`Service::drain_node`].
/// This is a long running operation and it should run as a separate Tokio task.
pub(crate) async fn fill_node(
&self,
node_id: NodeId,
cancel: CancellationToken,
) -> Result<(), OperationError> {
// TODO(vlad): Currently this operates on the assumption that all
// secondaries are warm. This is not always true (e.g. we just migrated the
// tenant). Take that into consideration by checking the secondary status.
let mut tids_to_promote = self.fill_node_plan(node_id);
let mut waiters = Vec::new();
// Execute the plan we've composed above. Before aplying each move from the plan,
// we validate to ensure that it has not gone stale in the meantime.
while !tids_to_promote.is_empty() {
if cancel.is_cancelled() {
match self
.node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
.await
{
Ok(()) => return Err(OperationError::Cancelled),
Err(err) => {
return Err(OperationError::FinalizeError(
format!(
"Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
node_id, err
)
.into(),
));
}
}
}
{
let mut locked = self.inner.write().unwrap();
let (nodes, tenants, scheduler) = locked.parts_mut();
let node = nodes.get(&node_id).ok_or(OperationError::NodeStateChanged(
format!("node {node_id} was removed").into(),
))?;
let current_policy = node.get_scheduling();
if !matches!(current_policy, NodeSchedulingPolicy::Filling) {
// TODO(vlad): maybe cancel pending reconciles before erroring out. need to think
// about it
return Err(OperationError::NodeStateChanged(
format!("node {node_id} changed state to {current_policy:?}").into(),
));
}
while waiters.len() < MAX_RECONCILES_PER_OPERATION {
if let Some(tid) = tids_to_promote.pop() {
if let Some(tenant_shard) = tenants.get_mut(&tid) {
// If the node being filled is not a secondary anymore,
// skip the promotion.
if !tenant_shard.intent.get_secondary().contains(&node_id) {
continue;
}
let previously_attached_to = *tenant_shard.intent.get_attached();
match tenant_shard.reschedule_to_secondary(Some(node_id), scheduler) {
Err(e) => {
tracing::warn!(
tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
"Scheduling error when filling pageserver {} : {e}", node_id
);
}
Ok(()) => {
tracing::info!(
tenant_id=%tid.tenant_id, shard_id=%tid.shard_slug(),
"Rescheduled shard while filling node {}: {:?} -> {}",
node_id,
previously_attached_to,
node_id
);
if let Some(waiter) =
self.maybe_reconcile_shard(tenant_shard, nodes)
{
waiters.push(waiter);
}
}
}
}
} else {
break;
}
}
}
waiters = self
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
.await;
}
while !waiters.is_empty() {
if cancel.is_cancelled() {
match self
.node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
.await
{
Ok(()) => return Err(OperationError::Cancelled),
Err(err) => {
return Err(OperationError::FinalizeError(
format!(
"Failed to finalise drain cancel of {} by setting scheduling policy to Active: {}",
node_id, err
)
.into(),
));
}
}
}
tracing::info!("Awaiting {} pending fill reconciliations", waiters.len());
waiters = self
.await_waiters_remainder(waiters, SHORT_RECONCILE_TIMEOUT)
.await;
}
if let Err(err) = self
.node_configure(node_id, None, Some(NodeSchedulingPolicy::Active))
.await
{
// This isn't a huge issue since the filling process starts upon request. However, it
// will prevent the next drain from starting. The only case in which this can fail
// is database unavailability. Such a case will require manual intervention.
return Err(OperationError::FinalizeError(
format!("Failed to finalise fill of {node_id} by setting scheduling policy to Active: {err}")
.into(),
));
}
Ok(())
}
}

View File

@@ -10,9 +10,7 @@ use crate::{
reconciler::ReconcileUnits,
scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext},
};
use pageserver_api::controller_api::{
NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy,
};
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
@@ -313,12 +311,6 @@ pub(crate) struct ReconcilerWaiter {
seq: Sequence,
}
pub(crate) enum ReconcilerStatus {
Done,
Failed,
InProgress,
}
#[derive(thiserror::Error, Debug)]
pub(crate) enum ReconcileWaitError {
#[error("Timeout waiting for shard {0}")]
@@ -381,16 +373,6 @@ impl ReconcilerWaiter {
Ok(())
}
pub(crate) fn get_status(&self) -> ReconcilerStatus {
if self.seq_wait.would_wait_for(self.seq).is_err() {
ReconcilerStatus::Done
} else if self.error_seq_wait.would_wait_for(self.seq).is_err() {
ReconcilerStatus::Failed
} else {
ReconcilerStatus::InProgress
}
}
}
/// Having spawned a reconciler task, the tenant shard's state will carry enough
@@ -646,48 +628,6 @@ impl TenantShard {
Ok(())
}
/// Reschedule this tenant shard to one of its secondary locations. Returns a scheduling error
/// if the swap is not possible and leaves the intent state in its original state.
///
/// Arguments:
/// `attached_to`: the currently attached location matching the intent state (may be None if the
/// shard is not attached)
/// `promote_to`: an optional secondary location of this tenant shard. If set to None, we ask
/// the scheduler to recommend a node
pub(crate) fn reschedule_to_secondary(
&mut self,
promote_to: Option<NodeId>,
scheduler: &mut Scheduler,
) -> Result<(), ScheduleError> {
let promote_to = match promote_to {
Some(node) => node,
None => match scheduler.node_preferred(self.intent.get_secondary()) {
Some(node) => node,
None => {
return Err(ScheduleError::ImpossibleConstraint);
}
},
};
assert!(self.intent.get_secondary().contains(&promote_to));
if let Some(node) = self.intent.get_attached() {
let demoted = self.intent.demote_attached(scheduler, *node);
if !demoted {
return Err(ScheduleError::ImpossibleConstraint);
}
}
self.intent.promote_attached(scheduler, promote_to);
// Increment the sequence number for the edge case where a
// reconciler is already running to avoid waiting on the
// current reconcile instead of spawning a new one.
self.sequence = self.sequence.next();
Ok(())
}
/// Optimize attachments: if a shard has a secondary location that is preferable to
/// its primary location based on soft constraints, switch that secondary location
/// to be attached.
@@ -712,17 +652,13 @@ impl TenantShard {
let mut scores = all_pageservers
.iter()
.flat_map(|node_id| {
let node = nodes.get(node_id);
if node.is_none() {
None
} else if matches!(
node.unwrap().get_scheduling(),
NodeSchedulingPolicy::Filling
if matches!(
nodes
.get(node_id)
.map(|n| n.may_schedule())
.unwrap_or(MaySchedule::No),
MaySchedule::No
) {
// If the node is currently filling, don't count it as a candidate to avoid,
// racing with the background fill.
None
} else if matches!(node.unwrap().may_schedule(), MaySchedule::No) {
None
} else {
let affinity_score = schedule_context.get_node_affinity(*node_id);
@@ -1674,10 +1610,14 @@ pub(crate) mod tests {
// We should see equal number of locations on the two nodes.
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 2);
// Scheduling does not consider the number of attachments picking the initial
// pageserver to attach to (hence the assertion that all primaries are on the
// same node)
// TODO: Tweak the scheduling to evenly distribute attachments for new shards.
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 2);
assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0);
// Add another two nodes: we should see the shards spread out when their optimize
// methods are called

View File

@@ -118,6 +118,8 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
"libmetrics_launch_timestamp",
"libmetrics_build_info",
"libmetrics_tracing_event_count_total",
"pageserver_materialized_cache_hits_total",
"pageserver_materialized_cache_hits_direct_total",
"pageserver_page_cache_read_hits_total",
"pageserver_page_cache_read_accesses_total",
"pageserver_page_cache_size_current_bytes",

View File

@@ -1177,10 +1177,10 @@ class NeonEnv:
force=config.config_init_force,
)
def start(self, timeout_in_seconds: Optional[int] = None):
def start(self):
# Storage controller starts first, so that pageserver /re-attach calls don't
# bounce through retries on startup
self.storage_controller.start(timeout_in_seconds=timeout_in_seconds)
self.storage_controller.start()
# Wait for storage controller readiness to prevent unnecessary post start-up
# reconcile.
@@ -1196,18 +1196,10 @@ class NeonEnv:
) # The `or None` is for the linter
for pageserver in self.pageservers:
futs.append(
executor.submit(
lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds)
)
)
futs.append(executor.submit(lambda ps=pageserver: ps.start()))
for safekeeper in self.safekeepers:
futs.append(
executor.submit(
lambda sk=safekeeper: sk.start(timeout_in_seconds=timeout_in_seconds)
)
)
futs.append(executor.submit(lambda sk=safekeeper: sk.start()))
for f in futs:
f.result()
@@ -1791,13 +1783,8 @@ class NeonCli(AbstractNeonCli):
res.check_returncode()
return res
def storage_controller_start(
self,
timeout_in_seconds: Optional[int] = None,
):
def storage_controller_start(self):
cmd = ["storage_controller", "start"]
if timeout_in_seconds is not None:
cmd.append(f"--start-timeout={timeout_in_seconds}s")
return self.raw_cli(cmd)
def storage_controller_stop(self, immediate: bool):
@@ -1810,11 +1797,8 @@ class NeonCli(AbstractNeonCli):
self,
id: int,
extra_env_vars: Optional[Dict[str, str]] = None,
timeout_in_seconds: Optional[int] = None,
) -> "subprocess.CompletedProcess[str]":
start_args = ["pageserver", "start", f"--id={id}"]
if timeout_in_seconds is not None:
start_args.append(f"--start-timeout={timeout_in_seconds}s")
storage = self.env.pageserver_remote_storage
if isinstance(storage, S3Storage):
@@ -1832,10 +1816,7 @@ class NeonCli(AbstractNeonCli):
return self.raw_cli(cmd)
def safekeeper_start(
self,
id: int,
extra_opts: Optional[List[str]] = None,
timeout_in_seconds: Optional[int] = None,
self, id: int, extra_opts: Optional[List[str]] = None
) -> "subprocess.CompletedProcess[str]":
s3_env_vars = None
if isinstance(self.env.safekeepers_remote_storage, S3Storage):
@@ -1845,8 +1826,6 @@ class NeonCli(AbstractNeonCli):
extra_opts = [f"-e={opt}" for opt in extra_opts]
else:
extra_opts = []
if timeout_in_seconds is not None:
extra_opts.append(f"--start-timeout={timeout_in_seconds}s")
return self.raw_cli(
["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars
)
@@ -2098,9 +2077,9 @@ class NeonStorageController(MetricsGetter, LogUtils):
self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
self.logfile = self.workdir / "storage_controller.log"
def start(self, timeout_in_seconds: Optional[int] = None):
def start(self):
assert not self.running
self.env.neon_cli.storage_controller_start(timeout_in_seconds)
self.env.neon_cli.storage_controller_start()
self.running = True
return self
@@ -2180,19 +2159,12 @@ class NeonStorageController(MetricsGetter, LogUtils):
return time.time() - t1
def attach_hook_issue(
self,
tenant_shard_id: Union[TenantId, TenantShardId],
pageserver_id: int,
generation_override: Optional[int] = None,
self, tenant_shard_id: Union[TenantId, TenantShardId], pageserver_id: int
) -> int:
body = {"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id}
if generation_override is not None:
body["generation_override"] = generation_override
response = self.request(
"POST",
f"{self.env.storage_controller_api}/debug/v1/attach-hook",
json=body,
json={"tenant_shard_id": str(tenant_shard_id), "node_id": pageserver_id},
headers=self.headers(TokenScope.ADMIN),
)
gen = response.json()["gen"]
@@ -2241,46 +2213,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
headers=self.headers(TokenScope.ADMIN),
)
def node_drain(self, node_id):
log.info(f"node_drain({node_id})")
self.request(
"PUT",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
headers=self.headers(TokenScope.ADMIN),
)
def cancel_node_drain(self, node_id):
log.info(f"cancel_node_drain({node_id})")
self.request(
"DELETE",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/drain",
headers=self.headers(TokenScope.ADMIN),
)
def node_fill(self, node_id):
log.info(f"node_fill({node_id})")
self.request(
"PUT",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
headers=self.headers(TokenScope.ADMIN),
)
def cancel_node_fill(self, node_id):
log.info(f"cancel_node_fill({node_id})")
self.request(
"DELETE",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}/fill",
headers=self.headers(TokenScope.ADMIN),
)
def node_status(self, node_id):
response = self.request(
"GET",
f"{self.env.storage_controller_api}/control/v1/node/{node_id}",
headers=self.headers(TokenScope.ADMIN),
)
return response.json()
def node_list(self):
response = self.request(
"GET",
@@ -2568,7 +2500,6 @@ class NeonPageserver(PgProtocol, LogUtils):
def start(
self,
extra_env_vars: Optional[Dict[str, str]] = None,
timeout_in_seconds: Optional[int] = None,
) -> "NeonPageserver":
"""
Start the page server.
@@ -2577,9 +2508,7 @@ class NeonPageserver(PgProtocol, LogUtils):
"""
assert self.running is False
self.env.neon_cli.pageserver_start(
self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds
)
self.env.neon_cli.pageserver_start(self.id, extra_env_vars=extra_env_vars)
self.running = True
return self
@@ -2593,17 +2522,13 @@ class NeonPageserver(PgProtocol, LogUtils):
self.running = False
return self
def restart(
self,
immediate: bool = False,
timeout_in_seconds: Optional[int] = None,
):
def restart(self, immediate: bool = False):
"""
High level wrapper for restart: restarts the process, and waits for
tenant state to stabilize.
"""
self.stop(immediate=immediate)
self.start(timeout_in_seconds=timeout_in_seconds)
self.start()
self.quiesce_tenants()
def quiesce_tenants(self):
@@ -2686,7 +2611,6 @@ class NeonPageserver(PgProtocol, LogUtils):
config: None | Dict[str, Any] = None,
config_null: bool = False,
generation: Optional[int] = None,
override_storage_controller_generation: bool = False,
):
"""
Tenant attachment passes through here to acquire a generation number before proceeding
@@ -2695,10 +2619,6 @@ class NeonPageserver(PgProtocol, LogUtils):
client = self.http_client()
if generation is None:
generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
elif override_storage_controller_generation:
generation = self.env.storage_controller.attach_hook_issue(
tenant_id, self.id, generation
)
return client.tenant_attach(
tenant_id,
config,
@@ -2744,6 +2664,12 @@ class NeonPageserver(PgProtocol, LogUtils):
client = self.http_client(auth_token=auth_token)
return client.tenant_create(tenant_id, conf, generation=generation)
def tenant_load(self, tenant_id: TenantId):
client = self.http_client()
return client.tenant_load(
tenant_id, generation=self.env.storage_controller.attach_hook_issue(tenant_id, self.id)
)
def list_layers(
self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId
) -> list[Path]:
@@ -3484,13 +3410,6 @@ class Endpoint(PgProtocol, LogUtils):
self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
# path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf
# Semaphore is set to 1 when we start, and acquire'd back to zero when we stop
#
# We use a semaphore rather than a bool so that racing calls to stop() don't
# try and stop the same process twice, as stop() is called by test teardown and
# potentially by some __del__ chains in other threads.
self._running = threading.Semaphore(0)
def http_client(
self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
) -> EndpointHttpClient:
@@ -3568,7 +3487,7 @@ class Endpoint(PgProtocol, LogUtils):
pageserver_id=pageserver_id,
allow_multiple=allow_multiple,
)
self._running.release(1)
self.running = True
return self
@@ -3616,12 +3535,9 @@ class Endpoint(PgProtocol, LogUtils):
conf_file.write("\n".join(hba) + "\n")
conf_file.write(data)
if self.is_running():
if self.running:
self.safe_psql("SELECT pg_reload_conf()")
def is_running(self):
return self._running._value > 0
def reconfigure(self, pageserver_id: Optional[int] = None):
assert self.endpoint_id is not None
self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id)
@@ -3663,19 +3579,15 @@ class Endpoint(PgProtocol, LogUtils):
def stop(self, mode: str = "fast") -> "Endpoint":
"""
Stop the Postgres instance if it's running.
Because test teardown might try and stop an endpoint concurrently with test code
stopping the endpoint, this method is thread safe
Returns self.
"""
running = self._running.acquire(blocking=False)
if running:
if self.running:
assert self.endpoint_id is not None
self.env.neon_cli.endpoint_stop(
self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
)
self.running = False
return self
@@ -3685,13 +3597,12 @@ class Endpoint(PgProtocol, LogUtils):
Returns self.
"""
running = self._running.acquire(blocking=False)
if running:
assert self.endpoint_id is not None
self.env.neon_cli.endpoint_stop(
self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
)
self.endpoint_id = None
assert self.endpoint_id is not None
self.env.neon_cli.endpoint_stop(
self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
)
self.endpoint_id = None
self.running = False
return self
@@ -3879,13 +3790,9 @@ class Safekeeper(LogUtils):
self.running = running
self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log"
def start(
self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None
) -> "Safekeeper":
def start(self, extra_opts: Optional[List[str]] = None) -> "Safekeeper":
assert self.running is False
self.env.neon_cli.safekeeper_start(
self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds
)
self.env.neon_cli.safekeeper_start(self.id, extra_opts=extra_opts)
self.running = True
# wait for wal acceptor start by checking its status
started_at = time.time()

View File

@@ -106,11 +106,6 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
".*startup_reconcile: Could not scan node.*",
# Tests run in dev mode
".*Starting in dev mode.*",
# Tests that stop endpoints & use the storage controller's neon_local notification
# mechanism might fail (neon_local's stopping and endpoint isn't atomic wrt the storage
# controller's attempts to notify the endpoint).
".*reconciler.*neon_local notification hook failed.*",
".*reconciler.*neon_local error.*",
]

View File

@@ -340,6 +340,17 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
self.verbose_error(res)
return res
def tenant_load(self, tenant_id: TenantId, generation=None):
body = None
if generation is not None:
body = {"generation": generation}
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load", json=body)
self.verbose_error(res)
def tenant_ignore(self, tenant_id: TenantId):
res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/ignore")
self.verbose_error(res)
def tenant_status(
self, tenant_id: Union[TenantId, TenantShardId], activate: bool = False
) -> Dict[Any, Any]:

View File

@@ -66,8 +66,6 @@ def single_timeline(
env.pageserver.tenant_attach(
tenant,
config=template_config.copy(),
generation=100,
override_storage_controller_generation=True,
)
time.sleep(0.1)
wait_until_tenant_state(ps_http, tenant, "Broken", 10)

View File

@@ -430,6 +430,52 @@ def enable_remote_storage_versioning(
return response
def wait_tenant_status_404(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
iterations: int,
interval: float = 0.250,
):
def tenant_is_missing():
data = {}
try:
data = pageserver_http.tenant_status(tenant_id)
log.info(f"tenant status {data}")
except PageserverApiException as e:
log.debug(e)
if e.status_code == 404:
return
raise RuntimeError(f"Timeline exists state {data.get('state')}")
wait_until(iterations, interval=interval, func=tenant_is_missing)
def tenant_delete_wait_completed(
pageserver_http: PageserverHttpClient,
tenant_id: TenantId,
iterations: int,
ignore_errors: bool = False,
):
if not ignore_errors:
pageserver_http.tenant_delete(tenant_id=tenant_id)
else:
interval = 0.5
def delete_request_sent():
try:
pageserver_http.tenant_delete(tenant_id=tenant_id)
except PageserverApiException as e:
log.debug(e)
if e.status_code == 404:
return
except Exception as e:
log.debug(e)
wait_until(iterations, interval=interval, func=delete_request_sent)
wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
MANY_SMALL_LAYERS_TENANT_CONFIG = {
"gc_period": "0s",
"compaction_period": "0s",

View File

@@ -85,8 +85,6 @@ def test_pageserver_max_throughput_getpage_at_latest_lsn(
f"max_throughput_latest_lsn-{n_tenants}-{pgbench_scale}",
n_tenants,
setup_wrapper,
# https://github.com/neondatabase/neon/issues/8070
timeout_in_seconds=60,
)
env.pageserver.allowed_errors.append(
@@ -211,11 +209,3 @@ def run_benchmark_max_throughput_latest_lsn(
unit="ms",
report=MetricReport.LOWER_IS_BETTER,
)
env.storage_controller.allowed_errors.append(
# The test setup swaps NeonEnv instances, hence different
# pg instances are used for the storage controller db. This means
# the storage controller doesn't know about the nodes mentioned
# in attachments.json at start-up.
".* Scheduler missing node 1",
)

View File

@@ -2,7 +2,7 @@
Utilities used by all code in this sub-directory
"""
from typing import Any, Callable, Dict, Optional, Tuple
from typing import Any, Callable, Dict, Tuple
import fixtures.pageserver.many_tenants as many_tenants
from fixtures.common_types import TenantId, TimelineId
@@ -41,7 +41,6 @@ def setup_pageserver_with_tenants(
name: str,
n_tenants: int,
setup: Callable[[NeonEnv], Tuple[TenantId, TimelineId, Dict[str, Any]]],
timeout_in_seconds: Optional[int] = None,
) -> NeonEnv:
"""
Utility function to set up a pageserver with a given number of identical tenants.
@@ -51,6 +50,6 @@ def setup_pageserver_with_tenants(
return many_tenants.single_timeline(neon_env_builder, setup, n_tenants)
env = neon_env_builder.build_and_use_snapshot(name, doit)
env.start(timeout_in_seconds=timeout_in_seconds)
env.start()
ensure_pageserver_ready_for_benchmarking(env, n_tenants)
return env

View File

@@ -4,6 +4,7 @@ import pytest
from fixtures.benchmark_fixture import MetricReport
from fixtures.common_types import Lsn
from fixtures.compare_fixtures import NeonCompare, PgCompare
from fixtures.pageserver.utils import wait_tenant_status_404
from fixtures.pg_version import PgVersion
@@ -67,6 +68,7 @@ def measure_recovery_time(env: NeonCompare):
(attach_gen, _) = attach_status
client.tenant_delete(env.tenant)
wait_tenant_status_404(client, env.tenant, iterations=60, interval=0.5)
env.env.pageserver.tenant_create(tenant_id=env.tenant, generation=attach_gen)
# Measure recovery time

View File

@@ -1,5 +1,4 @@
import os
import queue
import random
import threading
import time
@@ -9,7 +8,11 @@ from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder
from fixtures.utils import query_scalar
def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder, build_type: str):
if build_type == "debug":
# Disable vectored read path cross validation since it makes the test time out.
neon_env_builder.pageserver_config_override = "validate_vectored_get=false"
env = neon_env_builder.init_start()
cache_dir = os.path.join(env.repo_dir, "file_cache")
@@ -30,10 +33,11 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
cur = endpoint.connect().cursor()
stop = threading.Event()
n_rows = 100000
n_threads = 20
n_updates_per_thread = 10000
n_updates_per_connection = 1000
n_total_updates = n_threads * n_updates_per_thread
cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)")
cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g")
@@ -44,11 +48,11 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
# performed (plus the initial 1 on each row).
#
# Furthermore, each thread will reconnect between every 1000 updates.
def run_updates(n_updates_performed_q: queue.Queue[int]):
def run_updates():
n_updates_performed = 0
conn = endpoint.connect()
cur = conn.cursor()
while not stop.is_set():
for _ in range(n_updates_per_thread):
id = random.randint(1, n_rows)
cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}")
n_updates_performed += 1
@@ -57,28 +61,19 @@ def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):
conn.close()
conn = endpoint.connect()
cur = conn.cursor()
n_updates_performed_q.put(n_updates_performed)
n_updates_performed_q: queue.Queue[int] = queue.Queue()
threads: List[threading.Thread] = []
for _i in range(n_threads):
thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True)
thread = threading.Thread(target=run_updates, args=(), daemon=True)
thread.start()
threads.append(thread)
time.sleep(5)
# unlink, this is what we're actually testing
new_cache_dir = os.path.join(env.repo_dir, "file_cache_new")
os.rename(cache_dir, new_cache_dir)
time.sleep(10)
stop.set()
n_updates_performed = 0
for thread in threads:
thread.join()
n_updates_performed += n_updates_performed_q.get()
assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed
assert query_scalar(cur, "SELECT SUM(n) FROM lfctest") == n_total_updates + n_rows

View File

@@ -11,6 +11,8 @@ from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubb
from fixtures.pageserver.common_types import parse_layer_file_name
from fixtures.pageserver.utils import (
assert_prefix_empty,
poll_for_remote_storage_iterations,
tenant_delete_wait_completed,
wait_for_upload_queue_empty,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage
@@ -361,7 +363,8 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder):
# Check that deletion works properly on a tenant that was live-migrated
# (reproduce https://github.com/neondatabase/neon/issues/6802)
pageserver_b.http_client().tenant_delete(tenant_id)
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
tenant_delete_wait_completed(pageserver_b.http_client(), tenant_id, iterations)
def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder):
@@ -549,7 +552,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder):
)
log.info("Deleting tenant...")
ps_attached.http_client().tenant_delete(tenant_id)
tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,

View File

@@ -23,11 +23,11 @@ if TYPE_CHECKING:
# Run the main PostgreSQL regression tests, in src/test/regress.
#
@pytest.mark.timeout(600)
@pytest.mark.parametrize("shard_count", [None, 4])
def test_pg_regress(
neon_env_builder: NeonEnvBuilder,
test_output_dir: Path,
build_type: str,
pg_bin: PgBin,
capsys: CaptureFixture[str],
base_dir: Path,
@@ -43,6 +43,10 @@ def test_pg_regress(
if shard_count is not None:
neon_env_builder.num_pageservers = shard_count
if build_type == "debug":
# Disable vectored read path cross validation since it makes the test time out.
neon_env_builder.pageserver_config_override = "validate_vectored_get=false"
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
neon_env_builder.enable_scrub_on_exit()
env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count)

View File

@@ -6,6 +6,7 @@ from fixtures.neon_fixtures import NeonEnv
def test_physical_replication(neon_simple_env: NeonEnv):
env = neon_simple_env
n_records = 100000
with env.endpoints.create_start(
branch_name="main",
endpoint_id="primary",
@@ -21,20 +22,8 @@ def test_physical_replication(neon_simple_env: NeonEnv):
with p_con.cursor() as p_cur:
with secondary.connect() as s_con:
with s_con.cursor() as s_cur:
runtime_secs = 30
started_at = time.time()
pk = 0
while True:
pk += 1
now = time.time()
if now - started_at > runtime_secs:
break
for pk in range(n_records):
p_cur.execute("insert into t (pk) values (%s)", (pk,))
# an earlier version of this test was based on a fixed number of loop iterations
# and selected for pk=(random.randrange(1, fixed number of loop iterations)).
# => the probability of selection for a value that was never inserted changed from 99.9999% to 0% over the course of the test.
#
# We changed the test to where=(random.randrange(1, 2*pk)), which means the probability is now fixed to 50%.
s_cur.execute(
"select * from t where pk=%s", (random.randrange(1, 2 * pk),)
"select * from t where pk=%s", (random.randrange(1, n_records),)
)

View File

@@ -11,6 +11,8 @@ from fixtures.pageserver.utils import (
MANY_SMALL_LAYERS_TENANT_CONFIG,
assert_prefix_empty,
enable_remote_storage_versioning,
poll_for_remote_storage_iterations,
tenant_delete_wait_completed,
wait_for_upload,
)
from fixtures.remote_storage import RemoteStorageKind, s3_storage
@@ -81,7 +83,8 @@ def test_tenant_s3_restore(
assert (
ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
), "tenant removed before we deletion was issued"
ps_http.tenant_delete(tenant_id)
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
ps_http.deletion_queue_flush(execute=True)
assert (
ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0

View File

@@ -24,6 +24,7 @@ from fixtures.pageserver.utils import (
enable_remote_storage_versioning,
list_prefix,
remote_storage_delete_key,
tenant_delete_wait_completed,
timeline_delete_wait_completed,
)
from fixtures.pg_version import PgVersion
@@ -39,7 +40,7 @@ from werkzeug.wrappers.response import Response
def get_node_shard_counts(env: NeonEnv, tenant_ids):
counts: defaultdict[int, int] = defaultdict(int)
counts: defaultdict[str, int] = defaultdict(int)
for tid in tenant_ids:
for shard in env.storage_controller.locate(tid):
counts[shard["node_id"]] += 1
@@ -157,7 +158,7 @@ def test_storage_controller_smoke(
# Delete all the tenants
for tid in tenant_ids:
env.storage_controller.pageserver_api().tenant_delete(tid)
tenant_delete_wait_completed(env.storage_controller.pageserver_api(), tid, 10)
env.storage_controller.consistency_check()
@@ -1383,8 +1384,7 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
tenant_id = env.initial_tenant
env.storage_controller.allowed_errors.extend(
[
".*Exclusive lock by.*",
".*Shared lock by.*",
".*Lock on.*",
".*Scheduling is disabled by policy.*",
f".*Operation TimelineCreate on key {tenant_id} has waited.*",
]
@@ -1416,23 +1416,9 @@ def test_lock_time_tracing(neon_env_builder: NeonEnvBuilder):
)
thread_update_tenant_policy.join()
env.storage_controller.assert_log_contains("Exclusive lock by UpdatePolicy was held for")
_, last_log_cursor = env.storage_controller.assert_log_contains(
f"Operation TimelineCreate on key {tenant_id} has waited"
)
# Test out shared lock
env.storage_controller.configure_failpoints(
("tenant-create-timeline-shared-lock", "return(31000)")
)
timeline_id = TimelineId.generate()
# This will hold the shared lock for enough time to cause an warning
env.storage_controller.pageserver_api().timeline_create(
pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id
)
env.storage_controller.assert_log_contains("Lock on UpdatePolicy was held for")
env.storage_controller.assert_log_contains(
"Shared lock by TimelineCreate was held for", offset=last_log_cursor
f"Operation TimelineCreate on key {tenant_id} has waited"
)
@@ -1516,156 +1502,3 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto
workload = Workload(env, tenant_id, timeline, branch_name=branch)
workload.expect_rows = expect_rows
workload.validate()
def retryable_node_operation(op, ps_id, max_attempts, backoff):
while max_attempts > 0:
try:
op(ps_id)
return
except StorageControllerApiException as e:
max_attempts -= 1
log.info(f"Operation failed ({max_attempts} attempts left): {e}")
if max_attempts == 0:
raise e
time.sleep(backoff)
def poll_node_status(env, node_id, desired_scheduling_policy, max_attempts, backoff):
log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy")
while max_attempts > 0:
try:
status = env.storage_controller.node_status(node_id)
policy = status["scheduling"]
if policy == desired_scheduling_policy:
return
else:
max_attempts -= 1
log.info(f"Status call returned {policy=} ({max_attempts} attempts left)")
if max_attempts == 0:
raise AssertionError(
f"Status for {node_id=} did not reach {desired_scheduling_policy=}"
)
time.sleep(backoff)
except StorageControllerApiException as e:
max_attempts -= 1
log.info(f"Status call failed ({max_attempts} retries left): {e}")
if max_attempts == 0:
raise e
time.sleep(backoff)
def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder):
"""
Graceful reststart of storage controller clusters use the drain and
fill hooks in order to migrate attachments away from pageservers before
restarting. In practice, Ansible will drive this process.
"""
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
env.start()
tenant_count = 5
shard_count_per_tenant = 8
total_shards = tenant_count * shard_count_per_tenant
tenant_ids = []
for _ in range(0, tenant_count):
tid = TenantId.generate()
tenant_ids.append(tid)
env.neon_cli.create_tenant(
tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
)
# Give things a chance to settle.
env.storage_controller.reconcile_until_idle(timeout_secs=30)
nodes = env.storage_controller.node_list()
assert len(nodes) == 2
def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards):
# Assert that all nodes have some attached shards
assert len(shard_counts) == len(env.pageservers)
min_shard_count = min(shard_counts.values())
max_shard_count = max(shard_counts.values())
flake_factor = 5 / 100
assert max_shard_count - min_shard_count <= int(total_shards * flake_factor)
# Perform a graceful rolling restart
for ps in env.pageservers:
retryable_node_operation(
lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2
)
poll_node_status(env, ps.id, "PauseForRestart", max_attempts=6, backoff=5)
shard_counts = get_node_shard_counts(env, tenant_ids)
log.info(f"Shard counts after draining node {ps.id}: {shard_counts}")
# Assert that we've drained the node
assert shard_counts[ps.id] == 0
# Assert that those shards actually went somewhere
assert sum(shard_counts.values()) == total_shards
ps.restart()
poll_node_status(env, ps.id, "Active", max_attempts=10, backoff=1)
retryable_node_operation(
lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2
)
poll_node_status(env, ps.id, "Active", max_attempts=6, backoff=5)
shard_counts = get_node_shard_counts(env, tenant_ids)
log.info(f"Shard counts after filling node {ps.id}: {shard_counts}")
assert_shard_counts_balanced(env, shard_counts, total_shards)
# Now check that shards are reasonably balanced
shard_counts = get_node_shard_counts(env, tenant_ids)
log.info(f"Shard counts after rolling restart: {shard_counts}")
assert_shard_counts_balanced(env, shard_counts, total_shards)
def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_pageservers = 2
env = neon_env_builder.init_configs()
env.start()
tenant_count = 5
shard_count_per_tenant = 8
tenant_ids = []
for _ in range(0, tenant_count):
tid = TenantId.generate()
tenant_ids.append(tid)
env.neon_cli.create_tenant(
tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant
)
# See sleep comment in the test above.
time.sleep(2)
nodes = env.storage_controller.node_list()
assert len(nodes) == 2
env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(2000)"))
ps_id_to_drain = env.pageservers[0].id
retryable_node_operation(
lambda ps_id: env.storage_controller.node_drain(ps_id),
ps_id_to_drain,
max_attempts=3,
backoff=2,
)
poll_node_status(env, ps_id_to_drain, "Draining", max_attempts=6, backoff=2)
env.storage_controller.cancel_node_drain(ps_id_to_drain)
poll_node_status(env, ps_id_to_drain, "Active", max_attempts=6, backoff=2)

View File

@@ -1,11 +1,17 @@
import concurrent.futures
import enum
import os
import shutil
from threading import Thread
import pytest
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
PgBin,
StorageScrubber,
last_flush_lsn_upload,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException
@@ -13,33 +19,18 @@ from fixtures.pageserver.utils import (
MANY_SMALL_LAYERS_TENANT_CONFIG,
assert_prefix_empty,
assert_prefix_not_empty,
poll_for_remote_storage_iterations,
tenant_delete_wait_completed,
wait_for_upload,
wait_tenant_status_404,
wait_until_tenant_active,
wait_until_tenant_state,
)
from fixtures.remote_storage import RemoteStorageKind, s3_storage
from fixtures.remote_storage import RemoteStorageKind, available_s3_storages, s3_storage
from fixtures.utils import run_pg_bench_small, wait_until
from requests.exceptions import ReadTimeout
def error_tolerant_delete(ps_http, tenant_id):
"""
For tests that inject 500 errors, we must retry repeatedly when issuing deletions
"""
while True:
try:
ps_http.tenant_delete(tenant_id=tenant_id)
except PageserverApiException as e:
if e.status_code == 500:
# This test uses failure injection, which can produce 500s as the pageserver expects
# the object store to always be available, and the ListObjects during deletion is generally
# an infallible operation
assert "simulated failure of remote operation" in e.message
else:
raise
else:
# Success, drop out
break
def test_tenant_delete_smoke(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
@@ -68,7 +59,21 @@ def test_tenant_delete_smoke(
# Check that deleting a non-existent tenant gives the expected result: this is a loop because we
# may need to retry on some remote storage errors injected by the test harness
error_tolerant_delete(ps_http, tenant_id)
while True:
try:
ps_http.tenant_delete(tenant_id=tenant_id)
except PageserverApiException as e:
if e.status_code == 500:
# This test uses failure injection, which can produce 500s as the pageserver expects
# the object store to always be available, and the ListObjects during deletion is generally
# an infallible operation
assert "simulated failure of remote operation" in e.message
elif e.status_code == 404:
# This is our expected result: trying to erase a non-existent tenant gives us 404
assert "NotFound" in e.message
break
else:
raise
env.neon_cli.create_tenant(
tenant_id=tenant_id,
@@ -103,8 +108,10 @@ def test_tenant_delete_smoke(
# Upload a heatmap so that we exercise deletion of that too
ps_http.tenant_heatmap_upload(tenant_id)
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 2
error_tolerant_delete(ps_http, tenant_id)
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
tenant_path = env.pageserver.tenant_dir(tenant_id)
@@ -122,7 +129,286 @@ def test_tenant_delete_smoke(
# Deletion updates the tenant count: the one default tenant remains
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0
class Check(enum.Enum):
RETRY_WITHOUT_RESTART = enum.auto()
RETRY_WITH_RESTART = enum.auto()
FAILPOINTS = [
"tenant-delete-before-shutdown",
"tenant-delete-before-create-remote-mark",
"tenant-delete-before-create-local-mark",
"tenant-delete-before-background",
"tenant-delete-before-polling-ongoing-deletions",
"tenant-delete-before-cleanup-remaining-fs-traces",
"tenant-delete-before-remove-timelines-dir",
"tenant-delete-before-remove-deleted-mark",
"tenant-delete-before-remove-tenant-dir",
# Some failpoints from timeline deletion
"timeline-delete-before-index-deleted-at",
"timeline-delete-before-rm",
"timeline-delete-before-index-delete",
]
FAILPOINTS_BEFORE_BACKGROUND = [
"timeline-delete-before-schedule",
"tenant-delete-before-shutdown",
"tenant-delete-before-create-remote-mark",
"tenant-delete-before-create-local-mark",
"tenant-delete-before-background",
]
def combinations():
result = []
remotes = available_s3_storages()
for remote_storage_kind in remotes:
for delete_failpoint in FAILPOINTS:
# Simulate failures for only one type of remote storage
# to avoid log pollution and make tests run faster
if remote_storage_kind is RemoteStorageKind.MOCK_S3:
simulate_failures = True
else:
simulate_failures = False
result.append((remote_storage_kind, delete_failpoint, simulate_failures))
return result
@pytest.mark.parametrize("check", list(Check))
@pytest.mark.parametrize("remote_storage_kind, failpoint, simulate_failures", combinations())
def test_delete_tenant_exercise_crash_safety_failpoints(
neon_env_builder: NeonEnvBuilder,
remote_storage_kind: RemoteStorageKind,
failpoint: str,
simulate_failures: bool,
check: Check,
pg_bin: PgBin,
):
if simulate_failures:
neon_env_builder.pageserver_config_override = "test_remote_failures=1"
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
tenant_id = env.initial_tenant
env.pageserver.allowed_errors.extend(
[
# From deletion polling
f".*NotFound: tenant {env.initial_tenant}.*",
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# We may leave some upload tasks in the queue. They're likely deletes.
# For uploads we explicitly wait with `last_flush_lsn_upload` below.
# So by ignoring these instead of waiting for empty upload queue
# we execute more distinct code paths.
'.*stopping left-over name="remote upload".*',
# an on-demand is cancelled by shutdown
".*initial size calculation failed: downloading failed, possibly for shutdown",
]
)
if simulate_failures:
env.pageserver.allowed_errors.append(
# The deletion queue will complain when it encounters simulated S3 errors
".*deletion executor: DeleteObjects request failed.*",
)
ps_http = env.pageserver.http_client()
timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
# generate enough layers
run_pg_bench_small(pg_bin, endpoint.connstr())
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
ps_http.configure_failpoints((failpoint, "return"))
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
# These failpoints are earlier than background task is spawned.
# so they result in api request failure.
if failpoint in FAILPOINTS_BEFORE_BACKGROUND:
with pytest.raises(PageserverApiException, match=failpoint):
ps_http.tenant_delete(tenant_id)
else:
ps_http.tenant_delete(tenant_id)
tenant_info = wait_until_tenant_state(
pageserver_http=ps_http,
tenant_id=tenant_id,
expected_state="Broken",
iterations=iterations,
)
reason = tenant_info["state"]["data"]["reason"]
log.info(f"tenant broken: {reason}")
# failpoint may not be the only error in the stack
assert reason.endswith(f"failpoint: {failpoint}"), reason
if check is Check.RETRY_WITH_RESTART:
env.pageserver.restart()
if failpoint in (
"tenant-delete-before-shutdown",
"tenant-delete-before-create-remote-mark",
):
wait_until_tenant_active(
ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
)
tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
else:
# Pageserver should've resumed deletion after restart.
wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10)
elif check is Check.RETRY_WITHOUT_RESTART:
# this should succeed
# this also checks that delete can be retried even when tenant is in Broken state
ps_http.configure_failpoints((failpoint, "off"))
tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
tenant_dir = env.pageserver.tenant_dir(tenant_id)
# Check local is empty
assert not tenant_dir.exists()
# Check remote is empty
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
allowed_postfix="initdb.tar.zst",
)
def test_tenant_delete_is_resumed_on_attach(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
remote_storage_kind = s3_storage()
neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
env.pageserver.allowed_errors.append(
# lucky race with stopping from flushing a layer we fail to schedule any uploads
".*layer flush task.+: could not flush frozen layer: update_metadata_file"
)
tenant_id = env.initial_tenant
ps_http = env.pageserver.http_client()
# create two timelines
for timeline in ["first", "second"]:
timeline_id = env.neon_cli.create_timeline(timeline, tenant_id=tenant_id)
with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
run_pg_bench_small(pg_bin, endpoint.connstr())
wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
# sanity check, data should be there
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
# failpoint before we remove index_part from s3
failpoint = "timeline-delete-before-index-delete"
ps_http.configure_failpoints((failpoint, "return"))
env.pageserver.allowed_errors.extend(
(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# From deletion polling
f".*NotFound: tenant {env.initial_tenant}.*",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
'.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',
)
)
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
ps_http.tenant_delete(tenant_id)
tenant_info = wait_until_tenant_state(
pageserver_http=ps_http,
tenant_id=tenant_id,
expected_state="Broken",
iterations=iterations,
)
assert_prefix_not_empty(
neon_env_builder.pageserver_remote_storage,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
reason = tenant_info["state"]["data"]["reason"]
# failpoint may not be the only error in the stack
assert reason.endswith(f"failpoint: {failpoint}"), reason
# now we stop pageserver and remove local tenant state
env.endpoints.stop_all()
env.pageserver.stop()
dir_to_clear = env.pageserver.tenant_dir()
shutil.rmtree(dir_to_clear)
os.mkdir(dir_to_clear)
env.pageserver.start()
# now we call attach
env.pageserver.tenant_attach(tenant_id=tenant_id)
# delete should be resumed
wait_tenant_status_404(ps_http, tenant_id, iterations)
# we shouldn've created tenant dir on disk
tenant_path = env.pageserver.tenant_dir(tenant_id)
assert not tenant_path.exists()
ps_http.deletion_queue_flush(execute=True)
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonEnvBuilder):
@@ -197,6 +483,105 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE
deletion.join()
def test_tenant_delete_concurrent(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
):
"""
Validate that concurrent delete requests to the same tenant behave correctly:
exactly one should execute: the rest should give 202 responses but not start
another deletion.
This is a reproducer for https://github.com/neondatabase/neon/issues/5936
"""
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
ps_http = env.pageserver.http_client()
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
# Populate some data
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
run_pg_bench_small(pg_bin, endpoint.connstr())
last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
env.pageserver.allowed_errors.extend(
[
# lucky race with stopping from flushing a layer we fail to schedule any uploads
".*layer flush task.+: could not flush frozen layer: update_metadata_file",
]
)
BEFORE_REMOVE_FAILPOINT = "tenant-delete-before-map-remove"
BEFORE_RUN_FAILPOINT = "tenant-delete-before-run"
# We will let the initial delete run until right before it would remove
# the tenant's TenantSlot. This pauses it in a state where the tenant
# is visible in Stopping state, and concurrent requests should fail with 4xx.
ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "pause"))
def delete_tenant():
return ps_http.tenant_delete(tenant_id)
def hit_remove_failpoint():
return env.pageserver.assert_log_contains(f"at failpoint {BEFORE_REMOVE_FAILPOINT}")[1]
def hit_run_failpoint():
env.pageserver.assert_log_contains(f"at failpoint {BEFORE_RUN_FAILPOINT}")
with concurrent.futures.ThreadPoolExecutor() as executor:
background_200_req = executor.submit(delete_tenant)
assert background_200_req.result(timeout=10).status_code == 202
# Wait until the first request completes its work and is blocked on removing
# the TenantSlot from tenant manager.
log_cursor = wait_until(100, 0.1, hit_remove_failpoint)
assert log_cursor is not None
# Start another request: this should succeed without actually entering the deletion code
ps_http.tenant_delete(tenant_id)
assert not env.pageserver.log_contains(
f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
)
# Start another background request, which will pause after acquiring a TenantSlotGuard
# but before completing.
ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "pause"))
background_4xx_req = executor.submit(delete_tenant)
wait_until(100, 0.1, hit_run_failpoint)
# The TenantSlot is still present while the original request is hung before
# final removal
assert (
ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1
)
# Permit the original request to run to success
ps_http.configure_failpoints((BEFORE_REMOVE_FAILPOINT, "off"))
# Permit the duplicate background request to run to completion and fail.
ps_http.configure_failpoints((BEFORE_RUN_FAILPOINT, "off"))
background_4xx_req.result(timeout=10)
assert not env.pageserver.log_contains(
f"at failpoint {BEFORE_RUN_FAILPOINT}", offset=log_cursor
)
# Physical deletion should have happened
assert_prefix_empty(
neon_env_builder.pageserver_remote_storage,
prefix="/".join(
(
"tenants",
str(tenant_id),
)
),
)
# Zero tenants remain (we deleted the default tenant)
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 0
assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "inprogress"}) == 0
def test_tenant_delete_races_timeline_creation(
neon_env_builder: NeonEnvBuilder,
pg_bin: PgBin,
@@ -289,7 +674,9 @@ def test_tenant_delete_races_timeline_creation(
# Disable the failpoint and wait for deletion to finish
ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off"))
ps_http.tenant_delete(tenant_id)
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
tenant_delete_wait_completed(ps_http, tenant_id, iterations, ignore_errors=True)
# Physical deletion should have happened
assert_prefix_empty(
@@ -340,7 +727,8 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder)
env.start()
ps_http = env.pageserver.http_client()
ps_http.tenant_delete(tenant_id)
iterations = poll_for_remote_storage_iterations(remote_storage_kind)
tenant_delete_wait_completed(ps_http, tenant_id, iterations)
env.stop()
scrubber.scan_metadata()

View File

@@ -344,6 +344,56 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
pageserver_http.timeline_gc(tenant_id, timeline_id, 0)
# Creates and ignores a tenant, then detaches it: first, with no parameters (should fail),
# then with parameters to force ignored tenant detach (should not fail).
def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
env = neon_simple_env
client = env.pageserver.http_client()
# create a new tenant
tenant_id, _ = env.neon_cli.create_tenant()
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
# assert tenant exists on disk
assert env.pageserver.tenant_dir(tenant_id).exists()
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
# we rely upon autocommit after each statement
endpoint.safe_psql_many(
queries=[
"CREATE TABLE t(key int primary key, value text)",
"INSERT INTO t SELECT generate_series(1,100000), 'payload'",
]
)
# ignore tenant
client.tenant_ignore(tenant_id)
env.pageserver.allowed_errors.append(".*NotFound: tenant .*")
# ensure tenant couldn't be detached without the special flag for ignored tenant
log.info("detaching ignored tenant WITHOUT required flag")
with pytest.raises(
expected_exception=PageserverApiException, match=f"NotFound: tenant {tenant_id}"
):
client.tenant_detach(tenant_id)
log.info("tenant detached failed as expected")
# ensure tenant is detached with ignore state
log.info("detaching ignored tenant with required flag")
client.tenant_detach(tenant_id, True)
log.info("ignored tenant detached without error")
# check that nothing is left on disk for deleted tenant
assert not env.pageserver.tenant_dir(tenant_id).exists()
# assert the tenant does not exists in the Pageserver
tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()]
assert (
tenant_id not in tenants_after_detach
), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory"
# Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach.
# Tenant should be detached without issues.
def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
@@ -450,6 +500,153 @@ def test_detach_while_attaching(
cur.execute("SELECT COUNT(*) FROM foo")
# Tests that `ignore` and `get` operations' combination is able to remove and restore the tenant in pageserver's memory.
# * writes some data into tenant's timeline
# * ensures it's synced with the remote storage
# * `ignore` the tenant
# * verify that ignored tenant files are generally unchanged, only an ignored mark had appeared
# * verify the ignored tenant is gone from pageserver's memory
# * restart the pageserver and verify that ignored tenant is still not loaded
# * `load` the same tenant
# * ensure that it's status is `Active` and it's present in pageserver's memory with all timelines
def test_ignored_tenant_reattach(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
ignored_tenant_id, _ = env.neon_cli.create_tenant()
tenant_dir = env.pageserver.tenant_dir(ignored_tenant_id)
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_before_ignore.sort()
timelines_before_ignore = [
timeline["timeline_id"]
for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
]
files_before_ignore = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
# ignore the tenant and veirfy it's not present in pageserver replies, with its files still on disk
pageserver_http.tenant_ignore(ignored_tenant_id)
files_after_ignore_with_retain = [tenant_path for tenant_path in tenant_dir.glob("**/*")]
new_files = set(files_after_ignore_with_retain) - set(files_before_ignore)
disappeared_files = set(files_before_ignore) - set(files_after_ignore_with_retain)
assert (
len(disappeared_files) == 0
), f"Tenant ignore should not remove files from disk, missing: {disappeared_files}"
assert (
len(new_files) == 1
), f"Only tenant ignore file should appear on disk but got: {new_files}"
tenants_after_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
assert ignored_tenant_id not in tenants_after_ignore, "Ignored tenant should be missing"
assert len(tenants_after_ignore) + 1 == len(
tenants_before_ignore
), "Only ignored tenant should be missing"
# restart the pageserver to ensure we don't load the ignore timeline
env.pageserver.stop()
env.pageserver.start()
tenants_after_restart = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_after_restart.sort()
assert (
tenants_after_restart == tenants_after_ignore
), "Ignored tenant should not be reloaded after pageserver restart"
# now, load it from the local files and expect it works
env.pageserver.tenant_load(tenant_id=ignored_tenant_id)
wait_until_tenant_state(pageserver_http, ignored_tenant_id, "Active", 5)
tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_after_attach.sort()
assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
timelines_after_ignore = [
timeline["timeline_id"]
for timeline in pageserver_http.timeline_list(tenant_id=ignored_tenant_id)
]
assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
# Tests that it's possible to `load` tenants with missing layers and get them restored:
# * writes some data into tenant's timeline
# * ensures it's synced with the remote storage
# * `ignore` the tenant
# * removes all timeline's local layers
# * `load` the same tenant
# * ensure that it's status is `Active`
# * check that timeline data is restored
def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
endpoint = env.endpoints.create_start("main")
tenant_id = env.initial_tenant
timeline_id = env.initial_timeline
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
data_id = 1
data_secret = "very secret secret"
insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint)
tenants_before_ignore = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_before_ignore.sort()
timelines_before_ignore = [
timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
]
# ignore the tenant and remove its layers
pageserver_http.tenant_ignore(tenant_id)
timeline_dir = env.pageserver.timeline_dir(tenant_id, timeline_id)
layers_removed = False
for dir_entry in timeline_dir.iterdir():
if dir_entry.name.startswith("00000"):
# Looks like a layer file. Remove it
dir_entry.unlink()
layers_removed = True
assert layers_removed, f"Found no layers for tenant {timeline_dir}"
# now, load it from the local files and expect it to work due to remote storage restoration
env.pageserver.tenant_load(tenant_id=tenant_id)
wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
tenants_after_attach = [tenant["id"] for tenant in pageserver_http.tenant_list()]
tenants_after_attach.sort()
assert tenants_after_attach == tenants_before_ignore, "Should have all tenants back"
timelines_after_ignore = [
timeline["timeline_id"] for timeline in pageserver_http.timeline_list(tenant_id=tenant_id)
]
assert timelines_before_ignore == timelines_after_ignore, "Should have all timelines back"
endpoint.stop()
endpoint.start()
ensure_test_data(data_id, data_secret, endpoint)
# Tests that attach is never working on a tenant, ignored or not, as long as it's not absent locally
# Similarly, tests that it's not possible to schedule a `load` for tenat that's not ignored.
def test_load_negatives(neon_env_builder: NeonEnvBuilder):
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start()
pageserver_http = env.pageserver.http_client()
env.endpoints.create_start("main")
tenant_id = env.initial_tenant
env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
with pytest.raises(
expected_exception=PageserverApiException,
match=f"tenant {tenant_id} already exists, state: Active",
):
env.pageserver.tenant_load(tenant_id)
pageserver_http.tenant_ignore(tenant_id)
def test_detach_while_activating(
neon_env_builder: NeonEnvBuilder,
):
@@ -573,7 +770,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
wait_until(10, 0.5, found_broken)
client.tenant_detach(env.initial_tenant)
client.tenant_ignore(env.initial_tenant)
def found_cleaned_up():
m = client.get_metrics()
@@ -585,7 +782,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading(
wait_until(10, 0.5, found_cleaned_up)
env.pageserver.tenant_attach(env.initial_tenant)
env.pageserver.tenant_load(env.initial_tenant)
def found_active():
m = client.get_metrics()

View File

@@ -15,6 +15,7 @@ from fixtures.pageserver.utils import (
assert_tenant_state,
wait_for_last_record_lsn,
wait_for_upload,
wait_tenant_status_404,
)
from fixtures.remote_storage import (
LocalFsStorage,
@@ -347,6 +348,9 @@ def test_tenant_relocation(
# is no longer involved, and if it is, we will see the error
origin_http.tenant_detach(tenant_id)
# Wait a little, so that the detach operation has time to finish.
wait_tenant_status_404(origin_http, tenant_id, iterations=100, interval=1)
post_migration_check(ep_main, 500500, old_local_path_main)
post_migration_check(ep_second, 1001000, old_local_path_second)

View File

@@ -15,6 +15,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.utils import (
tenant_delete_wait_completed,
timeline_delete_wait_completed,
wait_until_tenant_active,
)
@@ -668,7 +669,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder):
),
)
client.tenant_delete(env.initial_tenant)
tenant_delete_wait_completed(client, env.initial_tenant, 10)
client.configure_failpoints((failpoint, "off"))

View File

@@ -14,7 +14,7 @@ from fixtures.neon_fixtures import (
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException
from fixtures.pageserver.utils import wait_timeline_detail_404
from fixtures.pageserver.utils import wait_tenant_status_404, wait_timeline_detail_404
from fixtures.remote_storage import LocalFsStorage
from fixtures.utils import assert_pageserver_backups_equal
@@ -578,6 +578,7 @@ def test_timeline_ancestor_errors(neon_env_builder: NeonEnvBuilder):
assert info.value.status_code == 400
client.tenant_delete(env.initial_tenant)
wait_tenant_status_404(client, env.initial_tenant, 10, 1)
with pytest.raises(PageserverApiException) as e:
client.detach_ancestor(env.initial_tenant, first_branch)

View File

@@ -26,6 +26,7 @@ from fixtures.pageserver.utils import (
assert_tenant_state,
timeline_delete_wait_completed,
wait_for_upload_queue_empty,
wait_tenant_status_404,
wait_until_tenant_active,
)
from fixtures.pg_version import PgVersion
@@ -863,33 +864,39 @@ def delete_lazy_activating(
):
pageserver_http = pageserver.http_client()
# Deletion itself won't complete due to our failpoint: Tenant::shutdown can't complete while calculating
# logical size is paused in a failpoint. So instead we will use a log observation to check that
# on-demand activation was triggered by the tenant deletion
log_match = f".*attach{{tenant_id={delete_tenant_id} shard_id=0000 gen=[0-9a-f]+}}: Activating tenant \\(on-demand\\).*"
if expect_attaching:
assert pageserver_http.tenant_status(delete_tenant_id)["state"]["slug"] == "Attaching"
with concurrent.futures.ThreadPoolExecutor() as executor:
log.info("Starting background delete")
def shutting_down():
assert pageserver.log_contains(".*Waiting for timelines.*") is not None
def activated_on_demand():
assert pageserver.log_contains(log_match) is not None
def delete_tenant():
pageserver_http.tenant_delete(delete_tenant_id)
background_delete = executor.submit(delete_tenant)
# We expect deletion to enter shutdown of the tenant even though it's in the attaching state
log.info(f"Waiting for activation message '{log_match}'")
try:
# Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then
# hang because of our failpoint blocking activation.
wait_until(10, 1, shutting_down)
wait_until(10, 1, activated_on_demand)
finally:
log.info("Clearing failpoint")
pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
# Deletion should complete successfully now that failpoint is unblocked and shutdown can complete
# Deletion should complete successfully now that failpoint is unblocked
log.info("Joining background delete")
background_delete.result(timeout=10)
# Poll for deletion to complete
wait_tenant_status_404(pageserver_http, tenant_id=delete_tenant_id, iterations=40)
def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder):
"""

View File

@@ -25,7 +25,6 @@ axum = { version = "0.6", features = ["ws"] }
base64 = { version = "0.21", features = ["alloc"] }
base64ct = { version = "1", default-features = false, features = ["std"] }
bytes = { version = "1", features = ["serde"] }
camino = { version = "1", default-features = false, features = ["serde1"] }
chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
clap = { version = "4", features = ["derive", "string"] }
clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }