Compare commits

..

2 Commits

Author SHA1 Message Date
George MacKerron
bfb86b2e6e Briefer ALPN error message 2025-07-30 14:19:46 +01:00
George MacKerron
3bd2486778 Check postgresql ALPN value for direct SSL connections 2025-07-30 14:07:37 +01:00
148 changed files with 1449 additions and 9164 deletions

1
.gitignore vendored
View File

@@ -15,7 +15,6 @@ neon.iml
/.neon
/integration_tests/.neon
compaction-suite-results.*
pgxn/neon/communicator/communicator_bindings.h
docker-compose/docker-compose-parallel.yml
# Coverage

69
Cargo.lock generated
View File

@@ -259,17 +259,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
[[package]]
name = "atomic_enum"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -1307,31 +1296,13 @@ dependencies = [
name = "communicator"
version = "0.1.0"
dependencies = [
"atomic_enum",
"axum",
"bytes",
"cbindgen",
"clashmap",
"fiemap",
"http 1.3.1",
"itertools 0.10.5",
"libc",
"measured",
"neon-shmem",
"nix 0.30.1",
"pageserver_api",
"pageserver_client_grpc",
"pageserver_page_api",
"prometheus",
"prost 0.13.5",
"strum_macros",
"thiserror 1.0.69",
"tokio",
"tokio-pipe",
"tonic",
"tracing",
"tracing-subscriber",
"uring-common",
"utils",
"workspace_hack",
]
@@ -1672,9 +1643,9 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
[[package]]
name = "crossterm"
@@ -2338,15 +2309,6 @@ version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
[[package]]
name = "fiemap"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92e387bc8b3342ba5cd115fb566e6bf2c82562433dffcecbc2474265cf8a875a"
dependencies = [
"bitflags 2.8.0",
]
[[package]]
name = "filetime"
version = "0.2.22"
@@ -4429,16 +4391,13 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"axum",
"bytes",
"camino",
"clap",
"futures",
"hdrhistogram",
"http 1.3.1",
"humantime",
"humantime-serde",
"metrics",
"pageserver_api",
"pageserver_client",
"pageserver_client_grpc",
@@ -4528,7 +4487,6 @@ dependencies = [
"pageserver_client",
"pageserver_compaction",
"pageserver_page_api",
"peekable",
"pem",
"pin-project-lite",
"postgres-protocol",
@@ -4542,7 +4500,6 @@ dependencies = [
"pprof",
"pq_proto",
"procfs",
"prost 0.13.5",
"rand 0.9.1",
"range-set-blaze",
"regex",
@@ -4837,15 +4794,6 @@ dependencies = [
"sha2",
]
[[package]]
name = "peekable"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
dependencies = [
"smallvec",
]
[[package]]
name = "pem"
version = "3.0.3"
@@ -7698,16 +7646,6 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "tokio-pipe"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
dependencies = [
"libc",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.10"
@@ -9075,8 +9013,8 @@ dependencies = [
"clap",
"clap_builder",
"const-oid",
"criterion",
"crossbeam-epoch",
"crossbeam-utils",
"crypto-bigint 0.5.5",
"der 0.7.8",
"deranged",
@@ -9119,6 +9057,7 @@ dependencies = [
"num-iter",
"num-rational",
"num-traits",
"once_cell",
"p256 0.13.2",
"parquet",
"portable-atomic",

View File

@@ -93,7 +93,6 @@ clap = { version = "4.0", features = ["derive", "env"] }
clashmap = { version = "1.0", features = ["raw-api"] }
comfy-table = "7.1"
const_format = "0.2"
crossbeam-utils = "0.8.21"
crc32c = "0.6"
diatomic-waker = { version = "0.2.3" }
either = "1.8"
@@ -153,7 +152,6 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pem = "3.0.3"
peekable = "0.3.0"
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
procfs = "0.16"
@@ -192,7 +190,6 @@ smallvec = "1.11"
smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
spki = "0.7.3"
spin = "0.9.8"
strum = "0.26"
strum_macros = "0.26"
"subtle" = "2.5.0"
@@ -204,6 +201,7 @@ thiserror = "1.0"
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
tokio = { version = "1.43.1", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
@@ -244,9 +242,6 @@ zeroize = "1.8"
env_logger = "0.11"
log = "0.4"
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }

View File

@@ -1 +1 @@
SELECT num_requested AS checkpoints_req FROM pg_catalog.pg_stat_checkpointer;
SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer;

View File

@@ -1 +1 @@
SELECT checkpoints_req FROM pg_catalog.pg_stat_bgwriter;
SELECT checkpoints_req FROM pg_stat_bgwriter;

View File

@@ -1 +1 @@
SELECT checkpoints_timed FROM pg_catalog.pg_stat_bgwriter;
SELECT checkpoints_timed FROM pg_stat_bgwriter;

View File

@@ -1 +1 @@
SELECT (neon.backpressure_throttling_time()::pg_catalog.float8 / 1000000) AS throttled;
SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;

View File

@@ -1,4 +1,4 @@
SELECT CASE
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_replay_lsn() - '0/0')::pg_catalog.FLOAT8
ELSE (pg_catalog.pg_current_wal_lsn() - '0/0')::pg_catalog.FLOAT8
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8
END AS lsn;

View File

@@ -1,7 +1,7 @@
SELECT
(SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COUNT(*) FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
(SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;

View File

@@ -1,7 +1,7 @@
SELECT
(SELECT pg_catalog.current_setting('neon.timeline_id')) AS timeline_id,
(SELECT current_setting('neon.timeline_id')) AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(pg_catalog.sum(size), 0) FROM pg_catalog.pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;
(SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;

View File

@@ -1,9 +1,9 @@
SELECT
(SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(pg_catalog.sum((pg_catalog.pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
FROM (SELECT * FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
(SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
) AS logical_snapshots_bytes;

View File

@@ -1 +1 @@
SELECT pg_catalog.current_setting('max_connections') AS max_connections;
SELECT current_setting('max_connections') as max_connections;

View File

@@ -1,4 +1,4 @@
SELECT datname database_name,
pg_catalog.age(datfrozenxid) frozen_xid_age
FROM pg_catalog.pg_database
age(datfrozenxid) frozen_xid_age
FROM pg_database
ORDER BY frozen_xid_age DESC LIMIT 10;

View File

@@ -1,4 +1,4 @@
SELECT datname database_name,
pg_catalog.mxid_age(datminmxid) min_mxid_age
FROM pg_catalog.pg_database
mxid_age(datminmxid) min_mxid_age
FROM pg_database
ORDER BY min_mxid_age DESC LIMIT 10;

View File

@@ -1,4 +1,4 @@
SELECT CASE
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_receive_lsn() - '0/0')::pg_catalog.FLOAT8
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
ELSE 0
END AS lsn;

View File

@@ -1 +1 @@
SELECT subenabled::pg_catalog.text AS enabled, pg_catalog.count(*) AS subscriptions_count FROM pg_catalog.pg_subscription GROUP BY subenabled;
SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled;

View File

@@ -1 +1 @@
SELECT datname, state, pg_catalog.count(*) AS count FROM pg_catalog.pg_stat_activity WHERE state <> '' GROUP BY datname, state;
SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state;

View File

@@ -1,5 +1,5 @@
SELECT pg_catalog.sum(pg_catalog.pg_database_size(datname)) AS total
FROM pg_catalog.pg_database
SELECT sum(pg_database_size(datname)) AS total
FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2;

View File

@@ -3,6 +3,6 @@
-- minutes.
SELECT
x::pg_catalog.text AS duration_seconds,
x::text as duration_seconds,
neon.approximate_working_set_size_seconds(x) AS size
FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x);

View File

@@ -3,6 +3,6 @@
SELECT
x AS duration,
neon.approximate_working_set_size_seconds(extract('epoch' FROM x::pg_catalog.interval)::pg_catalog.int4) AS size FROM (
neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM (
VALUES ('5m'), ('15m'), ('1h')
) AS t (x);

View File

@@ -1 +1 @@
SELECT pg_catalog.pg_size_bytes(pg_catalog.current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;
SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;

View File

@@ -1,3 +1,3 @@
SELECT slot_name, (restart_lsn - '0/0')::pg_catalog.FLOAT8 AS restart_lsn
FROM pg_catalog.pg_replication_slots
SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
FROM pg_replication_slots
WHERE slot_type = 'logical';

View File

@@ -1 +1 @@
SELECT setting::pg_catalog.int4 AS max_cluster_size FROM pg_catalog.pg_settings WHERE name = 'neon.max_cluster_size';
SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size';

View File

@@ -1,13 +1,13 @@
-- We export stats for 10 non-system databases. Without this limit it is too
-- easy to abuse the system by creating lots of databases.
SELECT pg_catalog.pg_database_size(datname) AS db_size,
SELECT pg_database_size(datname) AS db_size,
deadlocks,
tup_inserted AS inserted,
tup_updated AS updated,
tup_deleted AS deleted,
datname
FROM pg_catalog.pg_stat_database
FROM pg_stat_database
WHERE datname IN (
SELECT datname FROM pg_database
-- Ignore invalid databases, as we will likely have problems with

View File

@@ -3,4 +3,4 @@
-- replay LSN may have advanced past the receive LSN we are using for the
-- calculation.
SELECT GREATEST(0, pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), pg_catalog.pg_last_wal_replay_lsn())) AS replication_delay_bytes;
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;

View File

@@ -1,5 +1,5 @@
SELECT
CASE
WHEN pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn() THEN 0
ELSE GREATEST(0, EXTRACT (EPOCH FROM pg_catalog.now() - pg_catalog.pg_last_xact_replay_timestamp()))
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
END AS replication_delay_seconds;

View File

@@ -1,10 +1,10 @@
SELECT
slot_name,
pg_catalog.pg_wal_lsn_diff(
pg_wal_lsn_diff(
CASE
WHEN pg_catalog.pg_is_in_recovery() THEN pg_catalog.pg_last_wal_replay_lsn()
ELSE pg_catalog.pg_current_wal_lsn()
WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
ELSE pg_current_wal_lsn()
END,
restart_lsn)::pg_catalog.FLOAT8 AS retained_wal
FROM pg_catalog.pg_replication_slots
restart_lsn)::FLOAT8 AS retained_wal
FROM pg_replication_slots
WHERE active = false;

View File

@@ -4,4 +4,4 @@ SELECT
WHEN wal_status = 'lost' THEN 1
ELSE 0
END AS wal_is_lost
FROM pg_catalog.pg_replication_slots;
FROM pg_replication_slots;

View File

@@ -279,7 +279,7 @@ fn main() -> Result<()> {
config,
)?;
let exit_code = compute_node.run().context("running compute node")?;
let exit_code = compute_node.run()?;
scenario.teardown();

View File

@@ -24,9 +24,9 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
});
let query = "
INSERT INTO public.health_check VALUES (1, pg_catalog.now())
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = pg_catalog.now();";
SET updated_at = now();";
match client.simple_query(query).await {
Result::Ok(result) => {

View File

@@ -32,12 +32,8 @@ use std::sync::{Arc, Condvar, Mutex, RwLock};
use std::time::{Duration, Instant};
use std::{env, fs};
use tokio::{spawn, sync::watch, task::JoinHandle, time};
use tokio_util::sync::CancellationToken;
use tracing::{Instrument, debug, error, info, instrument, warn};
use url::Url;
use utils::backoff::{
DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, exponential_backoff_duration,
};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::measured_stream::MeasuredReader;
@@ -196,7 +192,6 @@ pub struct ComputeState {
pub startup_span: Option<tracing::span::Span>,
pub lfc_prewarm_state: LfcPrewarmState,
pub lfc_prewarm_token: CancellationToken,
pub lfc_offload_state: LfcOffloadState,
/// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
@@ -222,7 +217,6 @@ impl ComputeState {
lfc_offload_state: LfcOffloadState::default(),
terminate_flush_lsn: None,
promote_state: None,
lfc_prewarm_token: CancellationToken::new(),
}
}
@@ -589,7 +583,7 @@ impl ComputeNode {
// that can affect `compute_ctl` and prevent it from properly configuring the database schema.
// Unset them via connection string options before connecting to the database.
// N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0 -c pgaudit.log=none";
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none";
let options = match conn_conf.get_options() {
// Allow the control plane to override any options set by the
// compute
@@ -1560,41 +1554,6 @@ impl ComputeNode {
Ok(lsn)
}
fn sync_safekeepers_with_retries(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
let max_retries = 5;
let mut attempts = 0;
loop {
let result = self.sync_safekeepers(storage_auth_token.clone());
match &result {
Ok(_) => {
if attempts > 0 {
tracing::info!("sync_safekeepers succeeded after {attempts} retries");
}
return result;
}
Err(e) if attempts < max_retries => {
tracing::info!(
"sync_safekeepers failed, will retry (attempt {attempts}): {e:#}"
);
}
Err(err) => {
tracing::warn!(
"sync_safekeepers still failed after {attempts} retries, giving up: {err:?}"
);
return result;
}
}
// sleep and retry
let backoff = exponential_backoff_duration(
attempts,
DEFAULT_BASE_BACKOFF_SECONDS,
DEFAULT_MAX_BACKOFF_SECONDS,
);
std::thread::sleep(backoff);
attempts += 1;
}
}
/// Do all the preparations like PGDATA directory creation, configuration,
/// safekeepers sync, basebackup, etc.
#[instrument(skip_all)]
@@ -1630,7 +1589,7 @@ impl ComputeNode {
lsn
} else {
info!("starting safekeepers syncing");
self.sync_safekeepers_with_retries(pspec.storage_auth_token.clone())
self.sync_safekeepers(pspec.storage_auth_token.clone())
.with_context(|| "failed to sync safekeepers")?
};
info!("safekeepers synced at LSN {}", lsn);
@@ -1925,7 +1884,7 @@ impl ComputeNode {
// It doesn't matter what were the options before, here we just want
// to connect and create a new superuser role.
const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0";
const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
zenith_admin_conf.options(ZENITH_OPTIONS);
let mut client =
@@ -2380,13 +2339,13 @@ impl ComputeNode {
let result = client
.simple_query(
"SELECT
pg_catalog.row_to_json(pss)
row_to_json(pg_stat_statements)
FROM
public.pg_stat_statements pss
pg_stat_statements
WHERE
pss.userid != 'cloud_admin'::pg_catalog.regrole::pg_catalog.oid
userid != 'cloud_admin'::regrole::oid
ORDER BY
(pss.mean_exec_time + pss.mean_plan_time) DESC
(mean_exec_time + mean_plan_time) DESC
LIMIT 100",
)
.await;
@@ -2514,11 +2473,11 @@ LIMIT 100",
// check the role grants first - to gracefully handle read-replicas.
let select = "SELECT privilege_type
FROM pg_catalog.pg_namespace
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) AS acl ON true
JOIN pg_catalog.pg_user users ON acl.grantee = users.usesysid
WHERE users.usename OPERATOR(pg_catalog.=) $1::pg_catalog.name
AND nspname OPERATOR(pg_catalog.=) $2::pg_catalog.name";
FROM pg_namespace
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
JOIN pg_user users ON acl.grantee = users.usesysid
WHERE users.usename = $1
AND nspname = $2";
let rows = db_client
.query(select, &[role_name, schema_name])
.await
@@ -2587,9 +2546,8 @@ LIMIT 100",
.await
.with_context(|| format!("Failed to execute query: {query}"))?;
} else {
let query = format!(
"CREATE EXTENSION IF NOT EXISTS {ext_name} WITH SCHEMA public VERSION {quoted_version}"
);
let query =
format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}");
db_client
.simple_query(&query)
.await

View File

@@ -7,8 +7,7 @@ use http::StatusCode;
use reqwest::Client;
use std::mem::replace;
use std::sync::Arc;
use tokio::{io::AsyncReadExt, select, spawn};
use tokio_util::sync::CancellationToken;
use tokio::{io::AsyncReadExt, spawn};
use tracing::{error, info};
#[derive(serde::Serialize, Default)]
@@ -93,35 +92,34 @@ impl ComputeNode {
/// If there is a prewarm request ongoing, return `false`, `true` otherwise.
/// Has a failpoint "compute-prewarm"
pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
let token: CancellationToken;
{
let state = &mut self.state.lock().unwrap();
token = state.lfc_prewarm_token.clone();
if let LfcPrewarmState::Prewarming =
replace(&mut state.lfc_prewarm_state, LfcPrewarmState::Prewarming)
{
let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
if let LfcPrewarmState::Prewarming = replace(state, LfcPrewarmState::Prewarming) {
return false;
}
}
crate::metrics::LFC_PREWARMS.inc();
let this = self.clone();
let cloned = self.clone();
spawn(async move {
let prewarm_state = match this.prewarm_impl(from_endpoint, token).await {
Ok(state) => state,
let state = match cloned.prewarm_impl(from_endpoint).await {
Ok(true) => LfcPrewarmState::Completed,
Ok(false) => {
info!(
"skipping LFC prewarm because LFC state is not found in endpoint storage"
);
LfcPrewarmState::Skipped
}
Err(err) => {
crate::metrics::LFC_PREWARM_ERRORS.inc();
error!(%err, "could not prewarm LFC");
let error = format!("{err:#}");
LfcPrewarmState::Failed { error }
LfcPrewarmState::Failed {
error: format!("{err:#}"),
}
}
};
let state = &mut this.state.lock().unwrap();
if let LfcPrewarmState::Cancelled = prewarm_state {
state.lfc_prewarm_token = CancellationToken::new();
}
state.lfc_prewarm_state = prewarm_state;
cloned.state.lock().unwrap().lfc_prewarm_state = state;
});
true
}
@@ -134,70 +132,47 @@ impl ComputeNode {
/// Request LFC state from endpoint storage and load corresponding pages into Postgres.
/// Returns a result with `false` if the LFC state is not found in endpoint storage.
async fn prewarm_impl(
&self,
from_endpoint: Option<String>,
token: CancellationToken,
) -> Result<LfcPrewarmState> {
let EndpointStoragePair {
url,
token: storage_token,
} = self.endpoint_storage_pair(from_endpoint)?;
async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<bool> {
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?;
#[cfg(feature = "testing")]
fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint"));
fail::fail_point!("compute-prewarm", |_| {
bail!("prewarm configured to fail because of a failpoint")
});
info!(%url, "requesting LFC state from endpoint storage");
let request = Client::new().get(&url).bearer_auth(storage_token);
let response = select! {
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
response = request.send() => response
}
.context("querying endpoint storage")?;
match response.status() {
let request = Client::new().get(&url).bearer_auth(token);
let res = request.send().await.context("querying endpoint storage")?;
match res.status() {
StatusCode::OK => (),
StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped),
StatusCode::NOT_FOUND => {
return Ok(false);
}
status => bail!("{status} querying endpoint storage"),
}
let mut uncompressed = Vec::new();
let lfc_state = select! {
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
lfc_state = response.bytes() => lfc_state
}
.context("getting request body from endpoint storage")?;
let mut decoder = ZstdDecoder::new(lfc_state.iter().as_slice());
select! {
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
read = decoder.read_to_end(&mut uncompressed) => read
}
.context("decoding LFC state")?;
let uncompressed_len = uncompressed.len();
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}");
// Client connection and prewarm info querying are fast and therefore don't need
// cancellation
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
let lfc_state = res
.bytes()
.await
.context("connecting to postgres")?;
let pg_token = client.cancel_token();
.context("getting request body from endpoint storage")?;
ZstdDecoder::new(lfc_state.iter().as_slice())
.read_to_end(&mut uncompressed)
.await
.context("decoding LFC state")?;
let uncompressed_len = uncompressed.len();
let params: Vec<&(dyn postgres_types::ToSql + Sync)> = vec![&uncompressed];
select! {
res = client.query_one("select neon.prewarm_local_cache($1)", &params) => res,
_ = token.cancelled() => {
pg_token.cancel_query(postgres::NoTls).await
.context("cancelling neon.prewarm_local_cache()")?;
return Ok(LfcPrewarmState::Cancelled)
}
}
.context("loading LFC state into postgres")
.map(|_| ())?;
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres");
Ok(LfcPrewarmState::Completed)
ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?
.query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
.await
.context("loading LFC state into postgres")
.map(|_| ())?;
Ok(true)
}
/// If offload request is ongoing, return false, true otherwise
@@ -225,20 +200,20 @@ impl ComputeNode {
async fn offload_lfc_with_state_update(&self) {
crate::metrics::LFC_OFFLOADS.inc();
let state = match self.offload_lfc_impl().await {
Ok(state) => state,
Err(err) => {
crate::metrics::LFC_OFFLOAD_ERRORS.inc();
error!(%err, "could not offload LFC");
let error = format!("{err:#}");
LfcOffloadState::Failed { error }
}
let Err(err) = self.offload_lfc_impl().await else {
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
return;
};
self.state.lock().unwrap().lfc_offload_state = state;
crate::metrics::LFC_OFFLOAD_ERRORS.inc();
error!(%err, "could not offload LFC state to endpoint storage");
self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
error: format!("{err:#}"),
};
}
async fn offload_lfc_impl(&self) -> Result<LfcOffloadState> {
async fn offload_lfc_impl(&self) -> Result<()> {
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
info!(%url, "requesting LFC state from Postgres");
@@ -253,7 +228,7 @@ impl ComputeNode {
.context("deserializing LFC state")?;
let Some(state) = state else {
info!(%url, "empty LFC state, not exporting");
return Ok(LfcOffloadState::Skipped);
return Ok(());
};
let mut compressed = Vec::new();
@@ -267,7 +242,7 @@ impl ComputeNode {
let request = Client::new().put(url).bearer_auth(token).body(compressed);
match request.send().await {
Ok(res) if res.status() == StatusCode::OK => Ok(LfcOffloadState::Completed),
Ok(res) if res.status() == StatusCode::OK => Ok(()),
Ok(res) => bail!(
"Request to endpoint storage failed with status: {}",
res.status()
@@ -275,8 +250,4 @@ impl ComputeNode {
Err(err) => Err(err).context("writing to endpoint storage"),
}
}
pub fn cancel_prewarm(self: &Arc<Self>) {
self.state.lock().unwrap().lfc_prewarm_token.cancel();
}
}

View File

@@ -78,7 +78,7 @@ impl ComputeNode {
const RETRIES: i32 = 20;
for i in 0..=RETRIES {
let row = client
.query_one("SELECT pg_catalog.pg_last_wal_replay_lsn()", &[])
.query_one("SELECT pg_last_wal_replay_lsn()", &[])
.await
.context("getting last replay lsn")?;
let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
@@ -103,7 +103,7 @@ impl ComputeNode {
.await
.context("setting safekeepers")?;
client
.query("SELECT pg_catalog.pg_reload_conf()", &[])
.query("SELECT pg_reload_conf()", &[])
.await
.context("reloading postgres config")?;
@@ -113,7 +113,7 @@ impl ComputeNode {
});
let row = client
.query_one("SELECT * FROM pg_catalog.pg_promote()", &[])
.query_one("SELECT * FROM pg_promote()", &[])
.await
.context("pg_promote")?;
if !row.get::<usize, bool>(0) {

View File

@@ -8,7 +8,7 @@ use std::path::Path;
use compute_api::responses::TlsConfig;
use compute_api::spec::{
ComputeAudit, ComputeMode, ComputeSpec, DatabricksSettings, GenericOption, PageserverProtocol,
ComputeAudit, ComputeMode, ComputeSpec, DatabricksSettings, GenericOption,
};
use crate::compute::ComputeNodeParams;
@@ -65,30 +65,15 @@ pub fn write_postgres_conf(
writeln!(file, "{conf}")?;
}
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
// Add options for connecting to storage
writeln!(file, "# Neon storage settings")?;
writeln!(file)?;
if let Some(conninfo) = &spec.pageserver_connection_info {
match conninfo.prefer_protocol {
PageserverProtocol::Libpq => {
writeln!(file, "neon.use_communicator_worker=false")?;
}
PageserverProtocol::Grpc => {
writeln!(file, "neon.use_communicator_worker=true")?;
}
}
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = conninfo.stripe_size {
writeln!(
file,
"# from compute spec's pageserver_connection_info.stripe_size field"
)?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
let num_shards = if conninfo.shard_count.0 == 0 {
1 // unsharded, treat it as a single shard
} else {
@@ -121,19 +106,11 @@ pub fn write_postgres_conf(
} else {
libpq_urls = None
}
// Similarly for gRPC URLs
if let Some(url) = &first_pageserver.grpc_url {
if let Some(ref mut urls) = grpc_urls {
urls.push(url.clone());
}
} else {
grpc_urls = None
}
}
if let Some(libpq_urls) = libpq_urls {
writeln!(
file,
"# derived from compute spec's pageserver_connection_info field"
"# derived from compute spec's pageserver_conninfo field"
)?;
writeln!(
file,
@@ -143,31 +120,24 @@ pub fn write_postgres_conf(
} else {
writeln!(file, "# no neon.pageserver_connstring")?;
}
if let Some(grpc_urls) = grpc_urls {
writeln!(
file,
"# derived from compute spec's pageserver_conninfo field"
)?;
writeln!(
file,
"neon.pageserver_grpc_urls={}",
escape_conf_value(&grpc_urls.join(","))
)?;
} else {
writeln!(file, "# no neon.pageserver_grpc_urls")?;
}
} else {
writeln!(file, "neon.use_communicator_worker=false")?;
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "# from compute spec's shard_stripe_size field")?;
if let Some(stripe_size) = conninfo.stripe_size {
writeln!(
file,
"# from compute spec's pageserver_conninfo.stripe_size field"
)?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
} else {
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "# from compute spec's pageserver_connstring field")?;
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "# from compute spec's shard_stripe_size field")?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
}
if !spec.safekeeper_connstrings.is_empty() {

View File

@@ -139,15 +139,6 @@ paths:
application/json:
schema:
$ref: "#/components/schemas/LfcPrewarmState"
delete:
tags:
- Prewarm
summary: Cancel ongoing LFC prewarm
description: ""
operationId: cancelLfcPrewarm
responses:
202:
description: Prewarm cancelled
/lfc/offload:
post:
@@ -645,7 +636,7 @@ components:
properties:
status:
description: LFC offload status
enum: [not_offloaded, offloading, completed, skipped, failed]
enum: [not_offloaded, offloading, completed, failed]
type: string
error:
description: LFC offload error, if any

View File

@@ -46,8 +46,3 @@ pub(in crate::http) async fn offload(compute: Compute) -> Response {
)
}
}
pub(in crate::http) async fn cancel_prewarm(compute: Compute) -> StatusCode {
compute.cancel_prewarm();
StatusCode::ACCEPTED
}

View File

@@ -99,12 +99,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
);
let authenticated_router = Router::<Arc<ComputeNode>>::new()
.route(
"/lfc/prewarm",
get(lfc::prewarm_state)
.post(lfc::prewarm)
.delete(lfc::cancel_prewarm),
)
.route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
.route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
.route("/promote", post(promote::promote))
.route("/check_writability", post(check_writability::is_writable))

View File

@@ -19,7 +19,7 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
.query(
"SELECT datname FROM pg_catalog.pg_database
WHERE datallowconn
AND datconnlimit OPERATOR(pg_catalog.<>) (OPERATOR(pg_catalog.-) 2::pg_catalog.int4)
AND datconnlimit <> - 2
LIMIT 500",
&[],
)
@@ -67,7 +67,7 @@ pub async fn get_installed_extensions(
let extensions: Vec<(String, String, i32)> = client
.query(
"SELECT extname, extversion, extowner::pg_catalog.int4 FROM pg_catalog.pg_extension",
"SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
&[],
)
.await?

View File

@@ -28,10 +28,7 @@ pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
let compute = compute.clone();
let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
let runtime = tokio::runtime::Handle::current();
thread::spawn(move || {
let _rt_guard = runtime.enter();
let _entered = span.entered();
if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
// TODO: might need stronger error feedback than logging an warning.

View File

@@ -76,7 +76,7 @@ impl<'m> MigrationRunner<'m> {
self.client
.simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")
.await?;
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key pg_catalog.int4 NOT NULL PRIMARY KEY, id pg_catalog.int8 NOT NULL DEFAULT 0)").await?;
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?;
self.client
.simple_query(
"INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",

View File

@@ -15,17 +15,17 @@ DO $$
DECLARE
role_name text;
BEGIN
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member')
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I INHERIT;', role_name);
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
END LOOP;
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles
FOR role_name IN SELECT rolname FROM pg_roles
WHERE
NOT pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT pg_catalog.starts_with(rolname, 'pg_')
NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I NOBYPASSRLS;', role_name);
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
END LOOP;
END $$;

View File

@@ -1,6 +1,6 @@
DO $$
BEGIN
IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name = 'server_version_num') THEN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}';
END IF;
END $$;

View File

@@ -5,9 +5,9 @@ DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE rolreplication IS TRUE
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I NOREPLICATION;', role_name);
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
END LOOP;
END $$;

View File

@@ -1,6 +1,6 @@
DO $$
BEGIN
IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name OPERATOR(pg_catalog.=) 'server_version_num'::pg_catalog.text) THEN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}';
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}';
END IF;

View File

@@ -2,7 +2,7 @@ DO $$
DECLARE
bypassrls boolean;
BEGIN
SELECT rolbypassrls INTO bypassrls FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser';
SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser';
IF NOT bypassrls THEN
RAISE EXCEPTION 'neon_superuser cannot bypass RLS';
END IF;

View File

@@ -4,8 +4,8 @@ DECLARE
BEGIN
FOR role IN
SELECT rolname AS name, rolinherit AS inherit
FROM pg_catalog.pg_roles
WHERE pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
FROM pg_roles
WHERE pg_has_role(rolname, 'neon_superuser', 'member')
LOOP
IF NOT role.inherit THEN
RAISE EXCEPTION '% cannot inherit', quote_ident(role.name);
@@ -14,12 +14,12 @@ BEGIN
FOR role IN
SELECT rolname AS name, rolbypassrls AS bypassrls
FROM pg_catalog.pg_roles
WHERE NOT pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
AND NOT pg_catalog.starts_with(rolname, 'pg_')
FROM pg_roles
WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member')
AND NOT starts_with(rolname, 'pg_')
LOOP
IF role.bypassrls THEN
RAISE EXCEPTION '% can bypass RLS', pg_catalog.quote_ident(role.name);
RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name);
END IF;
END LOOP;
END $$;

View File

@@ -1,10 +1,10 @@
DO $$
BEGIN
IF (SELECT pg_catalog.current_setting('server_version_num')::pg_catalog.numeric < 160000) THEN
IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN
RETURN;
END IF;
IF NOT (SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription';
END IF;
END $$;

View File

@@ -2,12 +2,12 @@ DO $$
DECLARE
monitor record;
BEGIN
SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
admin_option AS admin
INTO monitor
FROM pg_catalog.pg_auth_members
WHERE roleid = 'pg_monitor'::pg_catalog.regrole
AND member = 'neon_superuser'::pg_catalog.regrole;
FROM pg_auth_members
WHERE roleid = 'pg_monitor'::regrole
AND member = 'neon_superuser'::regrole;
IF monitor IS NULL THEN
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';

View File

@@ -2,11 +2,11 @@ DO $$
DECLARE
can_execute boolean;
BEGIN
SELECT pg_catalog.bool_and(pg_catalog.has_function_privilege('neon_superuser', oid, 'execute'))
SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute'))
INTO can_execute
FROM pg_catalog.pg_proc
FROM pg_proc
WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot')
AND pronamespace = 'pg_catalog'::pg_catalog.regnamespace;
AND pronamespace = 'pg_catalog'::regnamespace;
IF NOT can_execute THEN
RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot';
END IF;

View File

@@ -2,9 +2,9 @@ DO $$
DECLARE
can_execute boolean;
BEGIN
SELECT pg_catalog.has_function_privilege('neon_superuser', oid, 'execute')
SELECT has_function_privilege('neon_superuser', oid, 'execute')
INTO can_execute
FROM pg_catalog.pg_proc
FROM pg_proc
WHERE proname = 'pg_show_replication_origin_status'
AND pronamespace = 'pg_catalog'::regnamespace;
IF NOT can_execute THEN

View File

@@ -2,10 +2,10 @@ DO $$
DECLARE
signal_backend record;
BEGIN
SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
admin_option AS admin
INTO signal_backend
FROM pg_catalog.pg_auth_members
FROM pg_auth_members
WHERE roleid = 'pg_signal_backend'::regrole
AND member = 'neon_superuser'::regrole;

View File

@@ -407,9 +407,9 @@ fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> {
// like `postgres_exporter` use it to query Postgres statistics.
// Use explicit 8 bytes type casts to match Rust types.
let stats = cli.query_one(
"SELECT pg_catalog.coalesce(pg_catalog.sum(active_time), 0.0)::pg_catalog.float8 AS total_active_time,
pg_catalog.coalesce(pg_catalog.sum(sessions), 0)::pg_catalog.bigint AS total_sessions
FROM pg_catalog.pg_stat_database
"SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time,
coalesce(sum(sessions), 0)::bigint AS total_sessions
FROM pg_stat_database
WHERE datname NOT IN (
'postgres',
'template0',
@@ -445,11 +445,11 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime
let mut last_active: Option<DateTime<Utc>> = None;
// Get all running client backends except ourself, use RFC3339 DateTime format.
let backends = cli.query(
"SELECT state, pg_catalog.to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"'::pg_catalog.text) AS state_change
"SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
FROM pg_stat_activity
WHERE backend_type OPERATOR(pg_catalog.=) 'client backend'::pg_catalog.text
AND pid OPERATOR(pg_catalog.!=) pg_catalog.pg_backend_pid()
AND usename OPERATOR(pg_catalog.!=) 'cloud_admin'::pg_catalog.name;", // XXX: find a better way to filter other monitors?
WHERE backend_type = 'client backend'
AND pid != pg_backend_pid()
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
&[],
);

View File

@@ -299,9 +299,9 @@ pub async fn get_existing_dbs_async(
.query_raw::<str, &String, &[String; 0]>(
"SELECT
datname AS name,
(SELECT rolname FROM pg_catalog.pg_roles WHERE oid OPERATOR(pg_catalog.=) datdba) AS owner,
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
NOT datallowconn AS restrict_conn,
datconnlimit OPERATOR(pg_catalog.=) (OPERATOR(pg_catalog.-) 2) AS invalid
datconnlimit = - 2 AS invalid
FROM
pg_catalog.pg_database;",
&[],

View File

@@ -82,7 +82,7 @@ impl ComputeNode {
info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);
drop_subscriptions_done = match
client.query("select 1 from neon.drop_subscriptions_done where timeline_id OPERATOR(pg_catalog.=) $1", &[&timeline_id.to_string()]).await {
client.query("select 1 from neon.drop_subscriptions_done where timeline_id = $1", &[&timeline_id.to_string()]).await {
Ok(result) => !result.is_empty(),
Err(e) =>
{
@@ -1142,9 +1142,7 @@ async fn get_operations<'a>(
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
if libs.contains("pg_stat_statements") {
return Ok(Box::new(once(Operation {
query: String::from(
"CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA public",
),
query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"),
comment: Some(String::from("create system extensions")),
})));
}
@@ -1152,13 +1150,11 @@ async fn get_operations<'a>(
Ok(Box::new(empty()))
}
ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit WITH SCHEMA public"),
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
comment: Some(String::from("create pgaudit extensions")),
}))),
ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
query: String::from(
"CREATE EXTENSION IF NOT EXISTS pgauditlogtofile WITH SCHEMA public",
),
query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
comment: Some(String::from("create pgauditlogtofile extensions")),
}))),
// Disable pgaudit logging for postgres database.
@@ -1182,7 +1178,7 @@ async fn get_operations<'a>(
},
Operation {
query: String::from(
"UPDATE pg_catalog.pg_extension SET extrelocatable = true WHERE extname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name AND extrelocatable OPERATOR(pg_catalog.=) false",
"UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'",
),
comment: Some(String::from("compat/fix: make neon relocatable")),
},

View File

@@ -3,17 +3,16 @@ BEGIN
IF NOT EXISTS(
SELECT 1
FROM pg_catalog.pg_tables
WHERE tablename::pg_catalog.name OPERATOR(pg_catalog.=) 'health_check'::pg_catalog.name
AND schemaname::pg_catalog.name OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
WHERE tablename = 'health_check'
)
THEN
CREATE TABLE public.health_check (
id pg_catalog.int4 primary key generated by default as identity,
updated_at pg_catalog.timestamptz default pg_catalog.now()
CREATE TABLE health_check (
id serial primary key,
updated_at timestamptz default now()
);
INSERT INTO public.health_check VALUES (1, pg_catalog.now())
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = pg_catalog.now();
SET updated_at = now();
END IF;
END
$$

View File

@@ -0,0 +1,12 @@
DO $$
DECLARE
query varchar;
BEGIN
FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};'
FROM pg_proc p
JOIN pg_namespace nsp ON p.pronamespace = nsp.oid
WHERE nsp.nspname = 'anon' LOOP
EXECUTE query;
END LOOP;
END
$$;

View File

@@ -1,6 +1,6 @@
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname OPERATOR(pg_catalog.=) '{privileged_role_name}'::pg_catalog.name)
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}')
THEN
CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data;
END IF;

View File

@@ -4,14 +4,14 @@ $$
IF EXISTS(
SELECT nspname
FROM pg_catalog.pg_namespace
WHERE nspname OPERATOR(pg_catalog.=) 'public'
WHERE nspname = 'public'
) AND
pg_catalog.current_setting('server_version_num')::int OPERATOR(pg_catalog./) 10000 OPERATOR(pg_catalog.>=) 15
current_setting('server_version_num')::int / 10000 >= 15
THEN
IF EXISTS(
SELECT rolname
FROM pg_catalog.pg_roles
WHERE rolname OPERATOR(pg_catalog.=) 'web_access'
WHERE rolname = 'web_access'
)
THEN
GRANT CREATE ON SCHEMA public TO web_access;
@@ -20,7 +20,7 @@ $$
IF EXISTS(
SELECT nspname
FROM pg_catalog.pg_namespace
WHERE nspname OPERATOR(pg_catalog.=) 'public'
WHERE nspname = 'public'
)
THEN
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;

View File

@@ -2,17 +2,11 @@ DO ${outer_tag}$
DECLARE
subname TEXT;
BEGIN
LOCK TABLE pg_catalog.pg_subscription IN ACCESS EXCLUSIVE MODE;
FOR subname IN
SELECT pg_subscription.subname
FROM pg_catalog.pg_subscription
WHERE subdbid OPERATOR(pg_catalog.=) (
SELECT oid FROM pg_database WHERE datname OPERATOR(pg_catalog.=) {datname_str}::pg_catalog.name
)
LOOP
EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I DISABLE;', subname);
EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
EXECUTE pg_catalog.format('DROP SUBSCRIPTION %I;', subname);
LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE;
FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP
EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname);
EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
END LOOP;
END;
${outer_tag}$;

View File

@@ -3,19 +3,19 @@ BEGIN
IF NOT EXISTS(
SELECT 1
FROM pg_catalog.pg_tables
WHERE tablename OPERATOR(pg_catalog.=) 'drop_subscriptions_done'::pg_catalog.name
AND schemaname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name
WHERE tablename = 'drop_subscriptions_done'
AND schemaname = 'neon'
)
THEN
CREATE TABLE neon.drop_subscriptions_done
(id pg_catalog.int4 primary key generated by default as identity, timeline_id pg_catalog.text);
(id serial primary key, timeline_id text);
END IF;
-- preserve the timeline_id of the last drop_subscriptions run
-- to ensure that the cleanup of a timeline is executed only once.
-- use upsert to avoid the table bloat in case of cascade branching (branch of a branch)
INSERT INTO neon.drop_subscriptions_done VALUES (1, pg_catalog.current_setting('neon.timeline_id'))
INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id'))
ON CONFLICT (id) DO UPDATE
SET timeline_id = pg_catalog.current_setting('neon.timeline_id')::pg_catalog.text;
SET timeline_id = current_setting('neon.timeline_id');
END
$$

View File

@@ -15,15 +15,15 @@ BEGIN
WHERE schema_name IN ('public')
LOOP
FOR grantor IN EXECUTE
pg_catalog.format(
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee OPERATOR(pg_catalog.=) %s',
format(
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s',
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
quote_literal({role_name})
)
LOOP
EXECUTE pg_catalog.format('SET LOCAL ROLE %I', grantor);
EXECUTE format('SET LOCAL ROLE %I', grantor);
revoke_query := pg_catalog.format(
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
schema,
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`

View File

@@ -5,17 +5,17 @@ DO ${outer_tag}$
IF EXISTS(
SELECT nspname
FROM pg_catalog.pg_namespace
WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
WHERE nspname = 'public'
)
THEN
SELECT nspowner::regrole::text
FROM pg_catalog.pg_namespace
WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.text
WHERE nspname = 'public'
INTO schema_owner;
IF schema_owner OPERATOR(pg_catalog.=) 'cloud_admin'::pg_catalog.text OR schema_owner OPERATOR(pg_catalog.=) 'zenith_admin'::pg_catalog.text
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
THEN
EXECUTE pg_catalog.format('ALTER SCHEMA public OWNER TO %I', {db_owner});
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
END IF;
END IF;
END

View File

@@ -3,10 +3,10 @@ DO ${outer_tag}$
IF EXISTS(
SELECT 1
FROM pg_catalog.pg_database
WHERE datname OPERATOR(pg_catalog.=) {datname}::pg_catalog.name
WHERE datname = {datname}
)
THEN
EXECUTE pg_catalog.format('ALTER DATABASE %I is_template false', {datname});
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
END IF;
END
${outer_tag}$;

View File

@@ -71,9 +71,8 @@ const DEFAULT_PG_VERSION_NUM: &str = "17";
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";
/// Neon CLI.
#[derive(clap::Parser)]
#[command(version = GIT_VERSION, name = "Neon CLI")]
#[command(version = GIT_VERSION, about, name = "Neon CLI")]
struct Cli {
#[command(subcommand)]
command: NeonLocalCmd,
@@ -108,31 +107,30 @@ enum NeonLocalCmd {
Stop(StopCmdArgs),
}
/// Initialize a new Neon repository, preparing configs for services to start with.
#[derive(clap::Args)]
#[clap(about = "Initialize a new Neon repository, preparing configs for services to start with")]
struct InitCmdArgs {
/// How many pageservers to create (default 1).
#[clap(long)]
#[clap(long, help("How many pageservers to create (default 1)"))]
num_pageservers: Option<u16>,
#[clap(long)]
config: Option<PathBuf>,
/// Force initialization even if the repository is not empty.
#[clap(long, default_value = "must-not-exist")]
#[clap(long, help("Force initialization even if the repository is not empty"))]
#[arg(value_parser)]
#[clap(default_value = "must-not-exist")]
force: InitForceMode,
}
/// Start pageserver and safekeepers.
#[derive(clap::Args)]
#[clap(about = "Start pageserver and safekeepers")]
struct StartCmdArgs {
#[clap(long = "start-timeout", default_value = "10s")]
timeout: humantime::Duration,
}
/// Stop pageserver and safekeepers.
#[derive(clap::Args)]
#[clap(about = "Stop pageserver and safekeepers")]
struct StopCmdArgs {
#[arg(value_enum)]
#[clap(long, default_value_t = StopMode::Fast)]
@@ -145,8 +143,8 @@ enum StopMode {
Immediate,
}
/// Manage tenants.
#[derive(clap::Subcommand)]
#[clap(about = "Manage tenants")]
enum TenantCmd {
List,
Create(TenantCreateCmdArgs),
@@ -157,36 +155,38 @@ enum TenantCmd {
#[derive(clap::Args)]
struct TenantCreateCmdArgs {
/// Tenant ID, as a 32-byte hexadecimal string.
#[clap(long = "tenant-id")]
#[clap(
long = "tenant-id",
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
)]
tenant_id: Option<TenantId>,
/// Use a specific timeline id when creating a tenant and its initial timeline.
#[clap(long)]
#[clap(
long,
help = "Use a specific timeline id when creating a tenant and its initial timeline"
)]
timeline_id: Option<TimelineId>,
#[clap(short = 'c')]
config: Vec<String>,
/// Postgres version to use for the initial timeline.
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
#[clap(long)]
#[clap(long, help = "Postgres version to use for the initial timeline")]
pg_version: PgMajorVersion,
/// Use this tenant in future CLI commands where tenant_id is needed, but not specified.
#[clap(long)]
#[clap(
long,
help = "Use this tenant in future CLI commands where tenant_id is needed, but not specified"
)]
set_default: bool,
/// Number of shards in the new tenant.
#[clap(long)]
#[clap(long, help = "Number of shards in the new tenant")]
#[arg(default_value_t = 0)]
shard_count: u8,
/// Sharding stripe size in pages.
#[clap(long)]
#[clap(long, help = "Sharding stripe size in pages")]
shard_stripe_size: Option<u32>,
/// Placement policy shards in this tenant.
#[clap(long)]
#[clap(long, help = "Placement policy shards in this tenant")]
#[arg(value_parser = parse_placement_policy)]
placement_policy: Option<PlacementPolicy>,
}
@@ -195,35 +195,44 @@ fn parse_placement_policy(s: &str) -> anyhow::Result<PlacementPolicy> {
Ok(serde_json::from_str::<PlacementPolicy>(s)?)
}
/// Set a particular tenant as default in future CLI commands where tenant_id is needed, but not
/// specified.
#[derive(clap::Args)]
#[clap(
about = "Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"
)]
struct TenantSetDefaultCmdArgs {
/// Tenant ID, as a 32-byte hexadecimal string.
#[clap(long = "tenant-id")]
#[clap(
long = "tenant-id",
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
)]
tenant_id: TenantId,
}
#[derive(clap::Args)]
struct TenantConfigCmdArgs {
/// Tenant ID, as a 32-byte hexadecimal string.
#[clap(long = "tenant-id")]
#[clap(
long = "tenant-id",
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
)]
tenant_id: Option<TenantId>,
#[clap(short = 'c')]
config: Vec<String>,
}
/// Import a tenant that is present in remote storage, and create branches for its timelines.
#[derive(clap::Args)]
#[clap(
about = "Import a tenant that is present in remote storage, and create branches for its timelines"
)]
struct TenantImportCmdArgs {
/// Tenant ID, as a 32-byte hexadecimal string.
#[clap(long = "tenant-id")]
#[clap(
long = "tenant-id",
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
)]
tenant_id: TenantId,
}
/// Manage timelines.
#[derive(clap::Subcommand)]
#[clap(about = "Manage timelines")]
enum TimelineCmd {
List(TimelineListCmdArgs),
Branch(TimelineBranchCmdArgs),
@@ -231,87 +240,98 @@ enum TimelineCmd {
Import(TimelineImportCmdArgs),
}
/// List all timelines available to this pageserver.
#[derive(clap::Args)]
#[clap(about = "List all timelines available to this pageserver")]
struct TimelineListCmdArgs {
/// Tenant ID, as a 32-byte hexadecimal string.
#[clap(long = "tenant-id")]
#[clap(
long = "tenant-id",
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
)]
tenant_shard_id: Option<TenantShardId>,
}
/// Create a new timeline, branching off from another timeline.
#[derive(clap::Args)]
#[clap(about = "Create a new timeline, branching off from another timeline")]
struct TimelineBranchCmdArgs {
/// Tenant ID, as a 32-byte hexadecimal string.
#[clap(long = "tenant-id")]
#[clap(
long = "tenant-id",
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
)]
tenant_id: Option<TenantId>,
/// New timeline's ID, as a 32-byte hexadecimal string.
#[clap(long)]
#[clap(long, help = "New timeline's ID")]
timeline_id: Option<TimelineId>,
/// Human-readable alias for the new timeline.
#[clap(long)]
#[clap(long, help = "Human-readable alias for the new timeline")]
branch_name: String,
/// Use last Lsn of another timeline (and its data) as base when creating the new timeline. The
/// timeline gets resolved by its branch name.
#[clap(long)]
#[clap(
long,
help = "Use last Lsn of another timeline (and its data) as base when creating the new timeline. The timeline gets resolved by its branch name."
)]
ancestor_branch_name: Option<String>,
/// When using another timeline as base, use a specific Lsn in it instead of the latest one.
#[clap(long)]
#[clap(
long,
help = "When using another timeline as base, use a specific Lsn in it instead of the latest one"
)]
ancestor_start_lsn: Option<Lsn>,
}
/// Create a new blank timeline.
#[derive(clap::Args)]
#[clap(about = "Create a new blank timeline")]
struct TimelineCreateCmdArgs {
/// Tenant ID, as a 32-byte hexadecimal string.
#[clap(long = "tenant-id")]
#[clap(
long = "tenant-id",
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
)]
tenant_id: Option<TenantId>,
/// New timeline's ID, as a 32-byte hexadecimal string.
#[clap(long)]
#[clap(long, help = "New timeline's ID")]
timeline_id: Option<TimelineId>,
/// Human-readable alias for the new timeline.
#[clap(long)]
#[clap(long, help = "Human-readable alias for the new timeline")]
branch_name: String,
/// Postgres version.
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
#[clap(long)]
#[clap(long, help = "Postgres version")]
pg_version: PgMajorVersion,
}
/// Import a timeline from a basebackup directory.
#[derive(clap::Args)]
#[clap(about = "Import timeline from a basebackup directory")]
struct TimelineImportCmdArgs {
/// Tenant ID, as a 32-byte hexadecimal string.
#[clap(long = "tenant-id")]
#[clap(
long = "tenant-id",
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
)]
tenant_id: Option<TenantId>,
/// New timeline's ID, as a 32-byte hexadecimal string.
#[clap(long)]
#[clap(long, help = "New timeline's ID")]
timeline_id: TimelineId,
/// Human-readable alias for the new timeline.
#[clap(long)]
#[clap(long, help = "Human-readable alias for the new timeline")]
branch_name: String,
/// Basebackup tarfile to import.
#[clap(long)]
#[clap(long, help = "Basebackup tarfile to import")]
base_tarfile: PathBuf,
/// LSN the basebackup starts at.
#[clap(long)]
#[clap(long, help = "Lsn the basebackup starts at")]
base_lsn: Lsn,
/// WAL to add after base.
#[clap(long)]
#[clap(long, help = "Wal to add after base")]
wal_tarfile: Option<PathBuf>,
/// LSN the basebackup ends at.
#[clap(long)]
#[clap(long, help = "Lsn the basebackup ends at")]
end_lsn: Option<Lsn>,
/// Postgres version of the basebackup being imported.
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
#[clap(long)]
#[clap(long, help = "Postgres version of the backup being imported")]
pg_version: PgMajorVersion,
}
/// Manage pageservers.
#[derive(clap::Subcommand)]
#[clap(about = "Manage pageservers")]
enum PageserverCmd {
Status(PageserverStatusCmdArgs),
Start(PageserverStartCmdArgs),
@@ -319,202 +339,223 @@ enum PageserverCmd {
Restart(PageserverRestartCmdArgs),
}
/// Show status of a local pageserver.
#[derive(clap::Args)]
#[clap(about = "Show status of a local pageserver")]
struct PageserverStatusCmdArgs {
/// Pageserver ID.
#[clap(long = "id")]
#[clap(long = "id", help = "pageserver id")]
pageserver_id: Option<NodeId>,
}
/// Start local pageserver.
#[derive(clap::Args)]
#[clap(about = "Start local pageserver")]
struct PageserverStartCmdArgs {
/// Pageserver ID.
#[clap(long = "id")]
#[clap(long = "id", help = "pageserver id")]
pageserver_id: Option<NodeId>,
/// Timeout until we fail the command.
#[clap(short = 't', long)]
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
}
/// Stop local pageserver.
#[derive(clap::Args)]
#[clap(about = "Stop local pageserver")]
struct PageserverStopCmdArgs {
/// Pageserver ID.
#[clap(long = "id")]
#[clap(long = "id", help = "pageserver id")]
pageserver_id: Option<NodeId>,
/// If 'immediate', don't flush repository data at shutdown
#[clap(short = 'm')]
#[clap(
short = 'm',
help = "If 'immediate', don't flush repository data at shutdown"
)]
#[arg(value_enum, default_value = "fast")]
stop_mode: StopMode,
}
/// Restart local pageserver.
#[derive(clap::Args)]
#[clap(about = "Restart local pageserver")]
struct PageserverRestartCmdArgs {
/// Pageserver ID.
#[clap(long = "id")]
#[clap(long = "id", help = "pageserver id")]
pageserver_id: Option<NodeId>,
/// Timeout until we fail the command.
#[clap(short = 't', long)]
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
}
/// Manage storage controller.
#[derive(clap::Subcommand)]
#[clap(about = "Manage storage controller")]
enum StorageControllerCmd {
Start(StorageControllerStartCmdArgs),
Stop(StorageControllerStopCmdArgs),
}
/// Start storage controller.
#[derive(clap::Args)]
#[clap(about = "Start storage controller")]
struct StorageControllerStartCmdArgs {
/// Timeout until we fail the command.
#[clap(short = 't', long)]
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
/// Identifier used to distinguish storage controller instances.
#[clap(long)]
#[clap(
long,
help = "Identifier used to distinguish storage controller instances"
)]
#[arg(default_value_t = 1)]
instance_id: u8,
/// Base port for the storage controller instance identified by instance-id (defaults to
/// pageserver cplane api).
#[clap(long)]
#[clap(
long,
help = "Base port for the storage controller instance idenfified by instance-id (defaults to pageserver cplane api)"
)]
base_port: Option<u16>,
/// Whether the storage controller should handle pageserver-reported local disk loss events.
#[clap(long)]
#[clap(
long,
help = "Whether the storage controller should handle pageserver-reported local disk loss events."
)]
handle_ps_local_disk_loss: Option<bool>,
}
/// Stop storage controller.
#[derive(clap::Args)]
#[clap(about = "Stop storage controller")]
struct StorageControllerStopCmdArgs {
/// If 'immediate', don't flush repository data at shutdown
#[clap(short = 'm')]
#[clap(
short = 'm',
help = "If 'immediate', don't flush repository data at shutdown"
)]
#[arg(value_enum, default_value = "fast")]
stop_mode: StopMode,
/// Identifier used to distinguish storage controller instances.
#[clap(long)]
#[clap(
long,
help = "Identifier used to distinguish storage controller instances"
)]
#[arg(default_value_t = 1)]
instance_id: u8,
}
/// Manage storage broker.
#[derive(clap::Subcommand)]
#[clap(about = "Manage storage broker")]
enum StorageBrokerCmd {
Start(StorageBrokerStartCmdArgs),
Stop(StorageBrokerStopCmdArgs),
}
/// Start broker.
#[derive(clap::Args)]
#[clap(about = "Start broker")]
struct StorageBrokerStartCmdArgs {
/// Timeout until we fail the command.
#[clap(short = 't', long, default_value = "10s")]
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
}
/// Stop broker.
#[derive(clap::Args)]
#[clap(about = "stop broker")]
struct StorageBrokerStopCmdArgs {
/// If 'immediate', don't flush repository data on shutdown.
#[clap(short = 'm')]
#[clap(
short = 'm',
help = "If 'immediate', don't flush repository data at shutdown"
)]
#[arg(value_enum, default_value = "fast")]
stop_mode: StopMode,
}
/// Manage safekeepers.
#[derive(clap::Subcommand)]
#[clap(about = "Manage safekeepers")]
enum SafekeeperCmd {
Start(SafekeeperStartCmdArgs),
Stop(SafekeeperStopCmdArgs),
Restart(SafekeeperRestartCmdArgs),
}
/// Manage object storage.
#[derive(clap::Subcommand)]
#[clap(about = "Manage object storage")]
enum EndpointStorageCmd {
Start(EndpointStorageStartCmd),
Stop(EndpointStorageStopCmd),
}
/// Start object storage.
#[derive(clap::Args)]
#[clap(about = "Start object storage")]
struct EndpointStorageStartCmd {
/// Timeout until we fail the command.
#[clap(short = 't', long)]
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
}
/// Stop object storage.
#[derive(clap::Args)]
#[clap(about = "Stop object storage")]
struct EndpointStorageStopCmd {
/// If 'immediate', don't flush repository data on shutdown.
#[clap(short = 'm')]
#[arg(value_enum, default_value = "fast")]
#[clap(
short = 'm',
help = "If 'immediate', don't flush repository data at shutdown"
)]
stop_mode: StopMode,
}
/// Start local safekeeper.
#[derive(clap::Args)]
#[clap(about = "Start local safekeeper")]
struct SafekeeperStartCmdArgs {
/// Safekeeper ID.
#[clap(help = "safekeeper id")]
#[arg(default_value_t = NodeId(1))]
id: NodeId,
/// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo.
#[clap(short = 'e', long = "safekeeper-extra-opt")]
#[clap(
short = 'e',
long = "safekeeper-extra-opt",
help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo"
)]
extra_opt: Vec<String>,
/// Timeout until we fail the command.
#[clap(short = 't', long)]
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
}
/// Stop local safekeeper.
#[derive(clap::Args)]
#[clap(about = "Stop local safekeeper")]
struct SafekeeperStopCmdArgs {
/// Safekeeper ID.
#[clap(help = "safekeeper id")]
#[arg(default_value_t = NodeId(1))]
id: NodeId,
/// If 'immediate', don't flush repository data on shutdown.
#[arg(value_enum, default_value = "fast")]
#[clap(short = 'm')]
#[clap(
short = 'm',
help = "If 'immediate', don't flush repository data at shutdown"
)]
stop_mode: StopMode,
}
/// Restart local safekeeper.
#[derive(clap::Args)]
#[clap(about = "Restart local safekeeper")]
struct SafekeeperRestartCmdArgs {
/// Safekeeper ID.
#[clap(help = "safekeeper id")]
#[arg(default_value_t = NodeId(1))]
id: NodeId,
/// If 'immediate', don't flush repository data on shutdown.
#[arg(value_enum, default_value = "fast")]
#[clap(short = 'm')]
#[clap(
short = 'm',
help = "If 'immediate', don't flush repository data at shutdown"
)]
stop_mode: StopMode,
/// Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo.
#[clap(short = 'e', long = "safekeeper-extra-opt")]
#[clap(
short = 'e',
long = "safekeeper-extra-opt",
help = "Additional safekeeper invocation options, e.g. -e=--http-auth-public-key-path=foo"
)]
extra_opt: Vec<String>,
/// Timeout until we fail the command.
#[clap(short = 't', long)]
#[clap(short = 't', long, help = "timeout until we fail the command")]
#[arg(default_value = "10s")]
start_timeout: humantime::Duration,
}
/// Manage Postgres instances.
#[derive(clap::Subcommand)]
#[clap(about = "Manage Postgres instances")]
enum EndpointCmd {
List(EndpointListCmdArgs),
Create(EndpointCreateCmdArgs),
@@ -526,27 +567,33 @@ enum EndpointCmd {
GenerateJwt(EndpointGenerateJwtCmdArgs),
}
/// List endpoints.
#[derive(clap::Args)]
#[clap(about = "List endpoints")]
struct EndpointListCmdArgs {
/// Tenant ID, as a 32-byte hexadecimal string.
#[clap(long = "tenant-id")]
#[clap(
long = "tenant-id",
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
)]
tenant_shard_id: Option<TenantShardId>,
}
/// Create a compute endpoint.
#[derive(clap::Args)]
#[clap(about = "Create a compute endpoint")]
struct EndpointCreateCmdArgs {
/// Tenant ID, as a 32-byte hexadecimal string.
#[clap(long = "tenant-id")]
#[clap(
long = "tenant-id",
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
)]
tenant_id: Option<TenantId>,
/// Postgres endpoint ID.
#[clap(help = "Postgres endpoint id")]
endpoint_id: Option<String>,
/// Name of the branch the endpoint will run on.
#[clap(long)]
#[clap(long, help = "Name of the branch the endpoint will run on")]
branch_name: Option<String>,
/// Specify LSN on the timeline to start from. By default, end of the timeline would be used.
#[clap(long)]
#[clap(
long,
help = "Specify Lsn on the timeline to start from. By default, end of the timeline would be used"
)]
lsn: Option<Lsn>,
#[clap(long)]
pg_port: Option<u16>,
@@ -557,13 +604,16 @@ struct EndpointCreateCmdArgs {
#[clap(long = "pageserver-id")]
endpoint_pageserver_id: Option<NodeId>,
/// Don't do basebackup, create endpoint directory with only config files.
#[clap(long, action = clap::ArgAction::Set, default_value_t = false)]
#[clap(
long,
help = "Don't do basebackup, create endpoint directory with only config files",
action = clap::ArgAction::Set,
default_value_t = false
)]
config_only: bool,
/// Postgres version.
#[arg(default_value = DEFAULT_PG_VERSION_NUM)]
#[clap(long)]
#[clap(long, help = "Postgres version")]
pg_version: PgMajorVersion,
/// Use gRPC to communicate with Pageservers, by generating grpc:// connstrings.
@@ -574,140 +624,170 @@ struct EndpointCreateCmdArgs {
#[clap(long)]
grpc: bool,
/// If set, the node will be a hot replica on the specified timeline.
#[clap(long, action = clap::ArgAction::Set, default_value_t = false)]
#[clap(
long,
help = "If set, the node will be a hot replica on the specified timeline",
action = clap::ArgAction::Set,
default_value_t = false
)]
hot_standby: bool,
/// If set, will set up the catalog for neon_superuser.
#[clap(long)]
#[clap(long, help = "If set, will set up the catalog for neon_superuser")]
update_catalog: bool,
/// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but
/// useful for tests.
#[clap(long)]
#[clap(
long,
help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
)]
allow_multiple: bool,
/// Name of the privileged role for the endpoint.
// Only allow changing it on creation.
#[clap(long)]
/// Only allow changing it on creation
#[clap(long, help = "Name of the privileged role for the endpoint")]
privileged_role_name: Option<String>,
}
/// Start Postgres. If the endpoint doesn't exist yet, it is created.
#[derive(clap::Args)]
#[clap(about = "Start postgres. If the endpoint doesn't exist yet, it is created.")]
struct EndpointStartCmdArgs {
/// Postgres endpoint ID.
#[clap(help = "Postgres endpoint id")]
endpoint_id: String,
/// Pageserver ID.
#[clap(long = "pageserver-id")]
endpoint_pageserver_id: Option<NodeId>,
/// Safekeepers membership generation to prefix neon.safekeepers with.
#[clap(long)]
#[clap(
long,
help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
)]
safekeepers_generation: Option<u32>,
/// List of safekeepers endpoint will talk to.
#[clap(long)]
#[clap(
long,
help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override."
)]
safekeepers: Option<String>,
/// Configure the remote extensions storage proxy gateway URL to request for extensions.
#[clap(long, alias = "remote-ext-config")]
#[clap(
long,
help = "Configure the remote extensions storage proxy gateway URL to request for extensions.",
alias = "remote-ext-config"
)]
remote_ext_base_url: Option<String>,
/// If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`
#[clap(long)]
#[clap(
long,
help = "If set, will create test user `user` and `neondb` database. Requires `update-catalog = true`"
)]
create_test_user: bool,
/// Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but
/// useful for tests.
#[clap(long)]
#[clap(
long,
help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
)]
allow_multiple: bool,
/// Timeout until we fail the command.
#[clap(short = 't', long, value_parser= humantime::parse_duration)]
#[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
#[arg(default_value = "90s")]
start_timeout: Duration,
/// Download LFC cache from endpoint storage on endpoint startup
#[clap(long, default_value = "false")]
#[clap(
long,
help = "Download LFC cache from endpoint storage on endpoint startup",
default_value = "false"
)]
autoprewarm: bool,
/// Upload LFC cache to endpoint storage periodically
#[clap(long)]
#[clap(long, help = "Upload LFC cache to endpoint storage periodically")]
offload_lfc_interval_seconds: Option<std::num::NonZeroU64>,
/// Run in development mode, skipping VM-specific operations like process termination
#[clap(long, action = clap::ArgAction::SetTrue)]
#[clap(
long,
help = "Run in development mode, skipping VM-specific operations like process termination",
action = clap::ArgAction::SetTrue
)]
dev: bool,
}
/// Reconfigure an endpoint.
#[derive(clap::Args)]
#[clap(about = "Reconfigure an endpoint")]
struct EndpointReconfigureCmdArgs {
/// Tenant id. Represented as a hexadecimal string 32 symbols length
#[clap(long = "tenant-id")]
#[clap(
long = "tenant-id",
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
)]
tenant_id: Option<TenantId>,
/// Postgres endpoint ID.
#[clap(help = "Postgres endpoint id")]
endpoint_id: String,
/// Pageserver ID.
#[clap(long = "pageserver-id")]
endpoint_pageserver_id: Option<NodeId>,
#[clap(long)]
safekeepers: Option<String>,
}
/// Refresh the endpoint's configuration by forcing it reload it's spec
#[derive(clap::Args)]
#[clap(about = "Refresh the endpoint's configuration by forcing it reload it's spec")]
struct EndpointRefreshConfigurationArgs {
/// Postgres endpoint id
#[clap(help = "Postgres endpoint id")]
endpoint_id: String,
}
/// Stop an endpoint.
#[derive(clap::Args)]
#[clap(about = "Stop an endpoint")]
struct EndpointStopCmdArgs {
/// Postgres endpoint ID.
#[clap(help = "Postgres endpoint id")]
endpoint_id: String,
/// Also delete data directory (now optional, should be default in future).
#[clap(long)]
#[clap(
long,
help = "Also delete data directory (now optional, should be default in future)"
)]
destroy: bool,
/// Postgres shutdown mode, passed to `pg_ctl -m <mode>`.
#[clap(long)]
#[clap(long, help = "Postgres shutdown mode")]
#[clap(default_value = "fast")]
mode: EndpointTerminateMode,
}
/// Update the pageservers in the spec file of the compute endpoint
#[derive(clap::Args)]
#[clap(about = "Update the pageservers in the spec file of the compute endpoint")]
struct EndpointUpdatePageserversCmdArgs {
/// Postgres endpoint id
#[clap(help = "Postgres endpoint id")]
endpoint_id: String,
/// Specified pageserver id
#[clap(short = 'p', long)]
#[clap(short = 'p', long, help = "Specified pageserver id")]
pageserver_id: Option<NodeId>,
}
/// Generate a JWT for an endpoint.
#[derive(clap::Args)]
#[clap(about = "Generate a JWT for an endpoint")]
struct EndpointGenerateJwtCmdArgs {
/// Postgres endpoint ID.
#[clap(help = "Postgres endpoint id")]
endpoint_id: String,
/// Scope to generate the JWT with.
#[clap(short = 's', long, value_parser = ComputeClaimsScope::from_str)]
#[clap(short = 's', long, help = "Scope to generate the JWT with", value_parser = ComputeClaimsScope::from_str)]
scope: Option<ComputeClaimsScope>,
}
/// Manage neon_local branch name mappings.
#[derive(clap::Subcommand)]
#[clap(about = "Manage neon_local branch name mappings")]
enum MappingsCmd {
Map(MappingsMapCmdArgs),
}
/// Create new mapping which cannot exist already.
#[derive(clap::Args)]
#[clap(about = "Create new mapping which cannot exist already")]
struct MappingsMapCmdArgs {
/// Tenant ID, as a 32-byte hexadecimal string.
#[clap(long)]
#[clap(
long,
help = "Tenant id. Represented as a hexadecimal string 32 symbols length"
)]
tenant_id: TenantId,
/// Timeline ID, as a 32-byte hexadecimal string.
#[clap(long)]
#[clap(
long,
help = "Timeline id. Represented as a hexadecimal string 32 symbols length"
)]
timeline_id: TimelineId,
/// Branch name to give to the timeline.
#[clap(long)]
#[clap(long, help = "Branch name to give to the timeline")]
branch_name: String,
}

View File

@@ -120,11 +120,6 @@
"value": "host=pageserver port=6400",
"vartype": "string"
},
{
"name": "neon.pageserver_grpc_urls",
"value": "grpc://pageserver:6401/",
"vartype": "string"
},
{
"name": "max_replication_write_lag",
"value": "500MB",

View File

@@ -1,7 +1,6 @@
broker_endpoint='http://storage_broker:50051'
pg_distrib_dir='/usr/local/'
listen_pg_addr='0.0.0.0:6400'
listen_grpc_addr='0.0.0.0:6401'
listen_http_addr='0.0.0.0:9898'
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address

View File

@@ -1,246 +0,0 @@
# Node deletion API improvement
Created on 2025-07-07
Implemented on _TBD_
## Summary
This RFC describes improvements to the storage controller API for gracefully deleting pageserver
nodes.
## Motivation
The basic node deletion API introduced in [#8226](https://github.com/neondatabase/neon/issues/8333)
has several limitations:
- Deleted nodes can re-add themselves if they restart (e.g., a flaky node that keeps restarting and
we cannot reach via SSH to stop the pageserver). This issue has been resolved by tombstone
mechanism in [#12036](https://github.com/neondatabase/neon/issues/12036)
- Process of node deletion is not graceful, i.e. it just imitates a node failure
In this context, "graceful" node deletion means that users do not experience any disruption or
negative effects, provided the system remains in a healthy state (i.e., the remaining pageservers
can handle the workload and all requirements are met). To achieve this, the system must perform
live migration of all tenant shards from the node being deleted while the node is still running
and continue processing all incoming requests. The node is removed only after all tenant shards
have been safely migrated.
Although live migrations can be achieved with the drain functionality, it leads to incorrect shard
placement, such as not matching availability zones. This results in unnecessary work to optimize
the placement that was just recently performed.
If we delete a node before its tenant shards are fully moved, the new node won't have all the
needed data (e.g. heatmaps) ready. This means user requests to the new node will be much slower at
first. If there are many tenant shards, this slowdown affects a huge amount of users.
Graceful node deletion is more complicated and can introduce new issues. It takes longer because
live migration of each tenant shard can last several minutes. Using non-blocking accessors may
also cause deletion to wait if other processes are holding inner state lock. It also gets trickier
because we need to handle other requests, like drain and fill, at the same time.
## Impacted components (e.g. pageserver, safekeeper, console, etc)
- storage controller
- pageserver (indirectly)
## Proposed implementation
### Tombstones
To resolve the problem of deleted nodes re-adding themselves, a tombstone mechanism was introduced
as part of the node stored information. Each node has a separate `NodeLifecycle` field with two
possible states: `Active` and `Deleted`. When node deletion completes, the database row is not
deleted but instead has its `NodeLifecycle` column switched to `Deleted`. Nodes with `Deleted`
lifecycle are treated as if the row is absent for most handlers, with several exceptions: reattach
and register functionality must be aware of tombstones. Additionally, new debug handlers are
available for listing and deleting tombstones via the `/debug/v1/tombstone` path.
### Gracefulness
The problem of making node deletion graceful is complex and involves several challenges:
- **Cancellable**: The operation must be cancellable to allow administrators to abort the process
if needed, e.g. if run by mistake.
- **Non-blocking**: We don't want to block deployment operations like draining/filling on the node
deletion process. We need clear policies for handling concurrent operations: what happens when a
drain/fill request arrives while deletion is in progress, and what happens when a delete request
arrives while drain/fill is in progress.
- **Persistent**: If the storage controller restarts during this long-running operation, we must
preserve progress and automatically resume the deletion process after the storage controller
restarts.
- **Migrated correctly**: We cannot simply use the existing drain mechanism for nodes scheduled
for deletion, as this would move shards to irrelevant locations. The drain process expects the
node to return, so it only moves shards to backup locations, not to their preferred AZs. It also
leaves secondary locations unmoved. This could result in unnecessary load on the storage
controller and inefficient resource utilization.
- **Force option**: Administrators need the ability to force immediate, non-graceful deletion when
time constraints or emergency situations require it, bypassing the normal graceful migration
process.
See below for a detailed breakdown of the proposed changes and mechanisms.
#### Node lifecycle
New `NodeLifecycle` enum and a matching database field with these values:
- `Active`: The normal state. All operations are allowed.
- `ScheduledForDeletion`: The node is marked to be deleted soon. Deletion may be in progress or
will happen later, but the node will eventually be removed. All operations are allowed.
- `Deleted`: The node is fully deleted. No operations are allowed, and the node cannot be brought
back. The only action left is to remove its record from the database. Any attempt to register a
node in this state will fail.
This state persists across storage controller restarts.
**State transition**
```
+--------------------+
+---| Active |<---------------------+
| +--------------------+ |
| ^ |
| start_node_delete | cancel_node_delete |
v | |
+----------------------------------+ |
| ScheduledForDeletion | |
+----------------------------------+ |
| |
| node_register |
| |
| delete_node (at the finish) |
| |
v |
+---------+ tombstone_delete +----------+
| Deleted |-------------------------------->| no row |
+---------+ +----------+
```
#### NodeSchedulingPolicy::Deleting
A `Deleting` variant to the `NodeSchedulingPolicy` enum. This means the deletion function is
running for the node right now. Only one node can have the `Deleting` policy at a time.
The `NodeSchedulingPolicy::Deleting` state is persisted in the database. However, after a storage
controller restart, any node previously marked as `Deleting` will have its scheduling policy reset
to `Pause`. The policy will only transition back to `Deleting` when the deletion operation is
actively started again, as triggered by the node's `NodeLifecycle::ScheduledForDeletion` state.
`NodeSchedulingPolicy` transition details:
1. When `node_delete` begins, set the policy to `NodeSchedulingPolicy::Deleting`.
2. If `node_delete` is cancelled (for example, due to a concurrent drain operation), revert the
policy to its previous value. The policy is persisted in storcon DB.
3. After `node_delete` completes, the final value of the scheduling policy is irrelevant, since
`NodeLifecycle::Deleted` prevents any further access to this field.
The deletion process cannot be initiated for nodes currently undergoing deployment-related
operations (`Draining`, `Filling`, or `PauseForRestart` policies). Deletion will only be triggered
once the node transitions to either the `Active` or `Pause` state.
#### OperationTracker
A replacement for `Option<OperationHandler> ongoing_operation`, the `OperationTracker` is a
dedicated service state object responsible for managing all long-running node operations (drain,
fill, delete) with robust concurrency control.
Key responsibilities:
- Orchestrates the execution of operations
- Supports cancellation of currently running operations
- Enforces operation constraints, e.g. allowing only single drain/fill operation at a time
- Persists deletion state, enabling recovery of pending deletions across restarts
- Ensures thread safety across concurrent requests
#### Attached tenant shard processing
When deleting a node, handle each attached tenant shard as follows:
1. Pick the best node to become the new attached (the candidate).
2. If the candidate already has this shard as a secondary:
- Create a new secondary for the shard on another suitable node.
Otherwise:
- Create a secondary for the shard on the candidate node.
3. Wait until all secondaries are ready and pre-warmed.
4. Promote the candidate's secondary to attached.
5. Remove the secondary from the node being deleted.
This process safely moves all attached shards before deleting the node.
#### Secondary tenant shard processing
When deleting a node, handle each secondary tenant shard as follows:
1. Choose the best node to become the new secondary.
2. Create a secondary for the shard on that node.
3. Wait until the new secondary is ready.
4. Remove the secondary from the node being deleted.
This ensures all secondary shards are safely moved before deleting the node.
### Reliability, failure modes and corner cases
In case of a storage controller failure and following restart, the system behavior depends on the
`NodeLifecycle` state:
- If `NodeLifecycle` is `Active`: No action is taken for this node.
- If `NodeLifecycle` is `Deleted`: The node will not be re-added.
- If `NodeLifecycle` is `ScheduledForDeletion`: A deletion background task will be launched for
this node.
In case of a pageserver node failure during deletion, the behavior depends on the `force` flag:
- If `force` is set: The node deletion will proceed regardless of the node's availability.
- If `force` is not set: The deletion will be retried a limited number of times. If the node
remains unavailable, the deletion process will pause and automatically resume when the node
becomes healthy again.
### Operations concurrency
The following sections describe the behavior when different types of requests arrive at the storage
controller and how they interact with ongoing operations.
#### Delete request
Handler: `PUT /control/v1/node/:node_id/delete`
1. If node lifecycle is `NodeLifecycle::ScheduledForDeletion`:
- Return `200 OK`: there is already an ongoing deletion request for this node
2. Update & persist lifecycle to `NodeLifecycle::ScheduledForDeletion`
3. Persist current scheduling policy
4. If there is no active operation (drain/fill/delete):
- Run deletion process for this node
#### Cancel delete request
Handler: `DELETE /control/v1/node/:node_id/delete`
1. If node lifecycle is not `NodeLifecycle::ScheduledForDeletion`:
- Return `404 Not Found`: there is no current deletion request for this node
2. If the active operation is deleting this node, cancel it
3. Update & persist lifecycle to `NodeLifecycle::Active`
4. Restore the last scheduling policy from persistence
#### Drain/fill request
1. If there are already ongoing drain/fill processes:
- Return `409 Conflict`: queueing of drain/fill processes is not supported
2. If there is an ongoing delete process:
- Cancel it and wait until it is cancelled
3. Run the drain/fill process
4. After the drain/fill process is cancelled or finished:
- Try to find another candidate to delete and run the deletion process for that node
#### Drain/fill cancel request
1. If the active operation is not the related process:
- Return `400 Bad Request`: cancellation request is incorrect, operations are not the same
2. Cancel the active operation
3. Try to find another candidate to delete and run the deletion process for that node
## Definition of Done
- [x] Fix flaky node scenario and introduce related debug handlers
- [ ] Node deletion intent is persistent - a node will be eventually deleted after a deletion
request regardless of draining/filling requests and restarts
- [ ] Node deletion can be graceful - deletion completes only after moving all tenant shards to
recommended locations
- [ ] Deploying does not break due to long deletions - drain/fill operations override deletion
process and deletion resumes after drain/fill completes
- [ ] `force` flag is implemented and provides fast, failure-tolerant node removal (e.g., when a
pageserver node does not respond)
- [ ] Legacy delete handler code is removed from storage_controller, test_runner, and storcon_cli

View File

@@ -68,15 +68,11 @@ pub enum LfcPrewarmState {
/// We tried to fetch the corresponding LFC state from the endpoint storage,
/// but received `Not Found 404`. This should normally happen only during the
/// first endpoint start after creation with `autoprewarm: true`.
/// This may also happen if LFC is turned off or not initialized
///
/// During the orchestrated prewarm via API, when a caller explicitly
/// provides the LFC state key to prewarm from, it's the caller responsibility
/// to handle this status as an error state in this case.
Skipped,
/// LFC prewarm was cancelled. Some pages in LFC cache may be prewarmed if query
/// has started working before cancellation
Cancelled,
}
impl Display for LfcPrewarmState {
@@ -87,7 +83,6 @@ impl Display for LfcPrewarmState {
LfcPrewarmState::Completed => f.write_str("Completed"),
LfcPrewarmState::Skipped => f.write_str("Skipped"),
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
LfcPrewarmState::Cancelled => f.write_str("Cancelled"),
}
}
}
@@ -102,7 +97,6 @@ pub enum LfcOffloadState {
Failed {
error: String,
},
Skipped,
}
#[derive(Serialize, Debug, Clone, PartialEq)]

View File

@@ -16,5 +16,5 @@ rustc-hash.workspace = true
tempfile = "3.14.0"
[dev-dependencies]
rand = "0.9"
rand_distr = "0.5.1"
rand.workspace = true
rand_distr = "0.5.1"

View File

@@ -1,330 +0,0 @@
use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
use neon_shmem::hash::HashMapAccess;
use neon_shmem::hash::HashMapInit;
use neon_shmem::hash::entry::Entry;
use rand::distr::{Distribution, StandardUniform};
use rand::prelude::*;
use std::default::Default;
use std::hash::BuildHasher;
// Taken from bindings to C code
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
#[repr(C)]
pub struct FileCacheKey {
pub _spc_id: u32,
pub _db_id: u32,
pub _rel_number: u32,
pub _fork_num: u32,
pub _block_num: u32,
}
impl Distribution<FileCacheKey> for StandardUniform {
// questionable, but doesn't need to be good randomness
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> FileCacheKey {
FileCacheKey {
_spc_id: rng.random(),
_db_id: rng.random(),
_rel_number: rng.random(),
_fork_num: rng.random(),
_block_num: rng.random(),
}
}
}
#[derive(Clone, Debug)]
#[repr(C)]
pub struct FileCacheEntry {
pub _offset: u32,
pub _access_count: u32,
pub _prev: *mut FileCacheEntry,
pub _next: *mut FileCacheEntry,
pub _state: [u32; 8],
}
impl FileCacheEntry {
fn dummy() -> Self {
Self {
_offset: 0,
_access_count: 0,
_prev: std::ptr::null_mut(),
_next: std::ptr::null_mut(),
_state: [0; 8],
}
}
}
// Utilities for applying operations.
#[derive(Clone, Debug)]
struct TestOp<K, V>(K, Option<V>);
fn apply_op<K: Clone + std::hash::Hash + Eq, V, S: std::hash::BuildHasher>(
op: TestOp<K, V>,
map: &mut HashMapAccess<K, V, S>,
) {
let entry = map.entry(op.0);
match op.1 {
Some(new) => match entry {
Entry::Occupied(mut e) => Some(e.insert(new)),
Entry::Vacant(e) => {
_ = e.insert(new).unwrap();
None
}
},
None => match entry {
Entry::Occupied(e) => Some(e.remove()),
Entry::Vacant(_) => None,
},
};
}
// Hash utilities
struct SeaRandomState {
k1: u64,
k2: u64,
k3: u64,
k4: u64,
}
impl std::hash::BuildHasher for SeaRandomState {
type Hasher = seahash::SeaHasher;
fn build_hasher(&self) -> Self::Hasher {
seahash::SeaHasher::with_seeds(self.k1, self.k2, self.k3, self.k4)
}
}
impl SeaRandomState {
fn new() -> Self {
let mut rng = rand::rng();
Self {
k1: rng.random(),
k2: rng.random(),
k3: rng.random(),
k4: rng.random(),
}
}
}
fn small_benchs(c: &mut Criterion) {
let mut group = c.benchmark_group("Small maps");
group.sample_size(10);
group.bench_function("small_rehash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_xxhash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(twox_hash::xxhash64::RandomState::default())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_ahash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(ahash::RandomState::default())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_seahash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(SeaRandomState::new())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.finish();
}
fn real_benchs(c: &mut Criterion) {
let mut group = c.benchmark_group("Realistic workloads");
group.sample_size(10);
group.bench_function("real_bulk_insert", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut rng = rand::rng();
b.iter_batched(
|| HashMapInit::new_resizeable(size, size * 2).attach_writer(),
|writer| {
for _ in 0..ideal_filled {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
let entry = writer.entry(key);
match entry {
Entry::Occupied(mut e) => {
std::hint::black_box(e.insert(val));
}
Entry::Vacant(e) => {
let _ = std::hint::black_box(e.insert(val).unwrap());
}
}
}
},
BatchSize::SmallInput,
)
});
group.bench_function("real_rehash", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("real_rehash_hashbrown", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut writer = hashbrown::raw::RawTable::new();
let mut rng = rand::rng();
let hasher = rustc_hash::FxBuildHasher;
unsafe {
writer
.resize(
size,
|(k, _)| hasher.hash_one(k),
hashbrown::raw::Fallibility::Infallible,
)
.unwrap();
}
while writer.len() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
hasher.hash_one(k)
});
}
b.iter(|| unsafe {
writer.table.rehash_in_place(
&|table, index| {
hasher.hash_one(
&table
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
.as_ref()
.0,
)
},
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
Some(|ptr| std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry)))
} else {
None
},
)
});
});
for elems in [2, 4, 8, 16, 32, 64, 96, 112] {
group.bench_with_input(
BenchmarkId::new("real_rehash_varied", elems),
&elems,
|b, &size| {
let ideal_filled = size * 1_000_000;
let size = 125_000_000;
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
},
);
group.bench_with_input(
BenchmarkId::new("real_rehash_varied_hashbrown", elems),
&elems,
|b, &size| {
let ideal_filled = size * 1_000_000;
let size = 125_000_000;
let mut writer = hashbrown::raw::RawTable::new();
let mut rng = rand::rng();
let hasher = rustc_hash::FxBuildHasher;
unsafe {
writer
.resize(
size,
|(k, _)| hasher.hash_one(k),
hashbrown::raw::Fallibility::Infallible,
)
.unwrap();
}
while writer.len() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
hasher.hash_one(k)
});
}
b.iter(|| unsafe {
writer.table.rehash_in_place(
&|table, index| {
hasher.hash_one(
&table
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
.as_ref()
.0,
)
},
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
Some(|ptr| {
std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry))
})
} else {
None
},
)
});
},
);
}
group.finish();
}
criterion_group!(benches, small_benchs, real_benchs);
criterion_main!(benches);

View File

@@ -16,7 +16,6 @@
//!
//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock.
use std::fmt::Debug;
use std::hash::{BuildHasher, Hash};
use std::mem::MaybeUninit;
@@ -41,75 +40,45 @@ pub enum HashMapShrinkError {
#[error("shmem resize failed: {0}")]
ResizeError(shmem::Error),
/// Occupied entries in to-be-shrunk space were encountered beginning at the given index.
#[error("occupied entry in deallocated space found at {2} (in deallocated range of {0}..{1})")]
RemainingEntries(usize, usize, usize),
#[error("occupied entry in deallocated space found at {0}")]
RemainingEntries(usize),
}
/// This represents a hash table that (possibly) lives in shared memory.
/// If a new process is launched with fork(), the child process inherits
/// this struct.
#[must_use]
pub struct HashMapInit<K: 'static, V: 'static, S = rustc_hash::FxBuildHasher> {
pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
shmem_handle: Option<ShmemHandle>,
shared_ptr: *mut HashMapShared<K, V>,
shared_ptr: *mut HashMapShared<'a, K, V>,
shared_size: usize,
hasher: S,
num_buckets: u32,
}
impl<K, V, S> Debug for HashMapInit<K, V, S>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HashMapInit")
.field("shmem_handle", &self.shmem_handle)
.field("shared_ptr", &self.shared_ptr)
.field("shared_size", &self.shared_size)
// .field("hasher", &self.hasher)
.field("num_buckets", &self.num_buckets)
.finish()
}
}
/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
/// If a child process is launched with fork(), the child process should
/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
///
/// XXX: We're not making use of it at the moment, but this struct could
/// hold process-local information in the future.
pub struct HashMapAccess<K: 'static, V: 'static, S = rustc_hash::FxBuildHasher> {
pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
shmem_handle: Option<ShmemHandle>,
shared_ptr: *mut HashMapShared<K, V>,
shared_ptr: *mut HashMapShared<'a, K, V>,
hasher: S,
}
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<K, V, S> {}
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<K, V, S> {}
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
impl<K, V, S> Debug for HashMapAccess<K, V, S>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HashMapAccess")
.field("shmem_handle", &self.shmem_handle)
.field("shared_ptr", &self.shared_ptr)
// .field("hasher", &self.hasher)
.finish()
}
}
impl<K: Clone + Hash + Eq + 'static, V: 'static, S> HashMapInit<K, V, S> {
impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
/// Change the 'hasher' used by the hash table.
///
/// NOTE: This must be called right after creating the hash table,
/// before inserting any entries and before calling attach_writer/reader.
/// Otherwise different accessors could be using different hash function,
/// with confusing results.
pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<K, V, T> {
pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<'a, K, V, T> {
HashMapInit {
hasher,
shmem_handle: self.shmem_handle,
@@ -177,7 +146,7 @@ impl<K: Clone + Hash + Eq + 'static, V: 'static, S> HashMapInit<K, V, S> {
}
/// Attach to a hash table for writing.
pub fn attach_writer(self) -> HashMapAccess<K, V, S> {
pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
HashMapAccess {
shmem_handle: self.shmem_handle,
shared_ptr: self.shared_ptr,
@@ -189,7 +158,7 @@ impl<K: Clone + Hash + Eq + 'static, V: 'static, S> HashMapInit<K, V, S> {
///
/// This is a holdover from a previous implementation and is being kept around for
/// backwards compatibility reasons.
pub fn attach_reader(self) -> HashMapAccess<K, V, S> {
pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
self.attach_writer()
}
}
@@ -206,14 +175,14 @@ impl<K: Clone + Hash + Eq + 'static, V: 'static, S> HashMapInit<K, V, S> {
/// dictionary
///
/// In between the above parts, there can be padding bytes to align the parts correctly.
type HashMapShared<K, V> = RwLock<CoreHashMap<K, V>>;
type HashMapShared<'a, K, V> = RwLock<CoreHashMap<'a, K, V>>;
impl<K, V: 'static> HashMapInit<K, V, rustc_hash::FxBuildHasher>
impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
where
K: Clone + Hash + Eq + 'static,
K: Clone + Hash + Eq,
{
/// Place the hash table within a user-supplied fixed memory area.
pub fn with_fixed(num_buckets: u32, area: &'static mut [MaybeUninit<u8>]) -> Self {
pub fn with_fixed(num_buckets: u32, area: &'a mut [MaybeUninit<u8>]) -> Self {
Self::new(
num_buckets,
None,
@@ -269,10 +238,9 @@ where
}
}
impl<K, V, S: BuildHasher> HashMapAccess<K, V, S>
impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
where
K: Clone + Hash + Eq + 'static,
V: 'static
K: Clone + Hash + Eq,
{
/// Hash a key using the map's hasher.
#[inline]
@@ -280,7 +248,7 @@ where
self.hasher.hash_one(key)
}
fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'_, K, V> {
fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
let dict_pos = hash as usize % map.dictionary.len();
let first = map.dictionary[dict_pos];
@@ -330,9 +298,9 @@ where
/// Get a reference to the entry containing a key.
///
/// NB: This takes a write lock as there's no way to distinguish whether the intention
/// NB: THis takes a write lock as there's no way to distinguish whether the intention
/// is to use the entry for reading or for writing in advance.
pub fn entry(&self, key: K) -> Entry<'_, K, V> {
pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
let hash = self.get_hash_value(&key);
self.entry_with_hash(key, hash)
}
@@ -366,7 +334,7 @@ where
/// Has more overhead than one would intuitively expect: performs both a clone of the key
/// due to the [`OccupiedEntry`] type owning the key and also a hash of the key in order
/// to enable repairing the hash chain if the entry is removed.
pub fn entry_at_bucket(&self, pos: usize) -> Option<OccupiedEntry<'_, K, V>> {
pub fn entry_at_bucket(&self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
if pos >= map.buckets.len() {
return None;
@@ -390,16 +358,6 @@ where
map.get_num_buckets()
}
/// Returns the logical number of buckets in the table (aka the amount of allocatable buckets).
pub fn get_num_logical_buckets(&self) -> usize {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
if map.alloc_limit == INVALID_POS {
map.get_num_buckets()
} else {
map.alloc_limit as usize
}
}
/// Return the key and value stored in bucket with given index. This can be used to
/// iterate through the hash map.
// TODO: An Iterator might be nicer. The communicator's clock algorithm needs to
@@ -441,7 +399,7 @@ where
/// in the process.
fn rehash_dict(
&self,
inner: &mut CoreHashMap<K, V>,
inner: &mut CoreHashMap<'a, K, V>,
buckets_ptr: *mut core::Bucket<K, V>,
end_ptr: *mut u8,
num_buckets: u32,
@@ -552,12 +510,12 @@ where
/// # Panics
/// Panics if called on a map initialized with [`HashMapInit::with_fixed`] or if `num_buckets` is
/// greater than the number of buckets in the map.
pub fn begin_shrink(&self, num_buckets: u32) {
pub fn begin_shrink(&mut self, num_buckets: u32) {
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
// assert!(
// num_buckets <= map.get_num_buckets() as u32,
// "shrink called with a larger number of buckets"
// );
assert!(
num_buckets <= map.get_num_buckets() as u32,
"shrink called with a larger number of buckets"
);
_ = self
.shmem_handle
.as_ref()
@@ -602,9 +560,7 @@ where
for i in (num_buckets as usize)..map.buckets.len() {
if map.buckets[i].inner.is_some() {
return Err(HashMapShrinkError::RemainingEntries(
num_buckets as usize, map.buckets.len(), i)
);
return Err(HashMapShrinkError::RemainingEntries(i));
}
}

View File

@@ -1,6 +1,5 @@
//! Simple hash table with chaining.
use std::fmt::Debug;
use std::hash::Hash;
use std::mem::MaybeUninit;
@@ -18,25 +17,12 @@ pub(crate) struct Bucket<K, V> {
pub(crate) inner: Option<(K, V)>,
}
impl<K, V> Debug for Bucket<K, V>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Bucket")
.field("next", &self.next)
.field("inner", &self.inner)
.finish()
}
}
/// Core hash table implementation.
pub(crate) struct CoreHashMap<K: 'static, V: 'static> {
pub(crate) struct CoreHashMap<'a, K, V> {
/// Dictionary used to map hashes to bucket indices.
pub(crate) dictionary: &'static mut [u32],
pub(crate) dictionary: &'a mut [u32],
/// Buckets containing key-value pairs.
pub(crate) buckets: &'static mut [Bucket<K, V>],
pub(crate) buckets: &'a mut [Bucket<K, V>],
/// Head of the freelist.
pub(crate) free_head: u32,
/// Maximum index of a bucket allowed to be allocated. [`INVALID_POS`] if no limit.
@@ -45,27 +31,11 @@ pub(crate) struct CoreHashMap<K: 'static, V: 'static> {
pub(crate) buckets_in_use: u32,
}
impl<K, V> Debug for CoreHashMap<K, V>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("CoreHashMap")
.field("dictionary", &self.dictionary)
.field("buckets", &self.buckets)
.field("free_head", &self.free_head)
.field("alloc_limit", &self.alloc_limit)
.field("buckets_in_use", &self.buckets_in_use)
.finish()
}
}
/// Error for when there are no empty buckets left but one is needed.
#[derive(Debug, PartialEq)]
pub struct FullError;
impl<K: Clone + Hash + Eq, V> CoreHashMap<K, V> {
impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
const FILL_FACTOR: f32 = 0.60;
/// Estimate the size of data contained within the the hash map.
@@ -83,8 +53,8 @@ impl<K: Clone + Hash + Eq, V> CoreHashMap<K, V> {
}
pub fn new(
buckets: &'static mut [MaybeUninit<Bucket<K, V>>],
dictionary: &'static mut [MaybeUninit<u32>],
buckets: &'a mut [MaybeUninit<Bucket<K, V>>],
dictionary: &'a mut [MaybeUninit<u32>],
) -> Self {
// Initialize the buckets
for i in 0..buckets.len() {

View File

@@ -6,9 +6,9 @@ use crate::sync::{RwLockWriteGuard, ValueWriteGuard};
use std::hash::Hash;
use std::mem;
pub enum Entry<'b, K: 'static, V: 'static> {
Occupied(OccupiedEntry<'b, K, V>),
Vacant(VacantEntry<'b, K, V>),
pub enum Entry<'a, 'b, K, V> {
Occupied(OccupiedEntry<'a, 'b, K, V>),
Vacant(VacantEntry<'a, 'b, K, V>),
}
/// Enum representing the previous position within a chain.
@@ -22,9 +22,9 @@ pub(crate) enum PrevPos {
Unknown(u64),
}
pub struct OccupiedEntry<'b, K: 'static, V: 'static> {
pub struct OccupiedEntry<'a, 'b, K, V> {
/// Mutable reference to the map containing this entry.
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<K, V>>,
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
/// The key of the occupied entry
pub(crate) _key: K,
/// The index of the previous entry in the chain.
@@ -33,7 +33,7 @@ pub struct OccupiedEntry<'b, K: 'static, V: 'static> {
pub(crate) bucket_pos: u32,
}
impl<K: 'static, V: 'static> OccupiedEntry<'_, K, V> {
impl<K, V> OccupiedEntry<'_, '_, K, V> {
pub fn get(&self) -> &V {
&self.map.buckets[self.bucket_pos as usize]
.inner
@@ -61,10 +61,6 @@ impl<K: 'static, V: 'static> OccupiedEntry<'_, K, V> {
///
/// This may result in multiple bucket accesses if the entry was obtained by index as the
/// previous chain entry needs to be discovered in this case.
///
/// # Panics
/// Panics if the `prev_pos` field is equal to [`PrevPos::Unknown`]. In practice, this means
/// the entry was obtained via calling something like [`super::HashMapAccess::entry_at_bucket`].
pub fn remove(mut self) -> V {
// If this bucket was queried by index, go ahead and follow its chain from the start.
let prev = if let PrevPos::Unknown(hash) = self.prev_pos {
@@ -108,16 +104,16 @@ impl<K: 'static, V: 'static> OccupiedEntry<'_, K, V> {
}
/// An abstract view into a vacant entry within the map.
pub struct VacantEntry<'b, K: 'static, V: 'static> {
pub struct VacantEntry<'a, 'b, K, V> {
/// Mutable reference to the map containing this entry.
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<K, V>>,
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
/// The key to be inserted into this entry.
pub(crate) key: K,
/// The position within the dictionary corresponding to the key's hash.
pub(crate) dict_pos: u32,
}
impl<'b, K: Clone + Hash + Eq + 'static, V: 'static> VacantEntry<'b, K, V> {
impl<'b, K: Clone + Hash + Eq, V> VacantEntry<'_, 'b, K, V> {
/// Insert a value into the vacant entry, finding and populating an empty bucket in the process.
///
/// # Errors

View File

@@ -21,7 +21,6 @@ use nix::unistd::ftruncate as nix_ftruncate;
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
/// future.
#[derive(Debug)]
pub struct ShmemHandle {
/// memfd file descriptor
fd: OwnedFd,
@@ -36,7 +35,6 @@ pub struct ShmemHandle {
}
/// This is stored at the beginning in the shared memory area.
#[derive(Debug)]
struct SharedStruct {
max_size: usize,

View File

@@ -310,11 +310,6 @@ impl AtomicLsn {
}
}
/// Consumes the atomic and returns the contained value.
pub const fn into_inner(self) -> Lsn {
Lsn(self.inner.into_inner())
}
/// Atomically retrieve the `Lsn` value from memory.
pub fn load(&self) -> Lsn {
Lsn(self.inner.load(Ordering::Acquire))

View File

@@ -341,34 +341,6 @@ extern "C-unwind" fn log_internal(
}
}
/* BEGIN_HADRON */
extern "C" fn reset_safekeeper_statuses_for_metrics(wp: *mut WalProposer, num_safekeepers: u32) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
if api.is_null() {
return;
}
(*api).reset_safekeeper_statuses_for_metrics(&mut (*wp), num_safekeepers);
}
}
extern "C" fn update_safekeeper_status_for_metrics(
wp: *mut WalProposer,
sk_index: u32,
status: u8,
) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
if api.is_null() {
return;
}
(*api).update_safekeeper_status_for_metrics(&mut (*wp), sk_index, status);
}
}
/* END_HADRON */
#[derive(Debug, PartialEq)]
pub enum Level {
Debug5,
@@ -442,10 +414,6 @@ pub(crate) fn create_api() -> walproposer_api {
finish_sync_safekeepers: Some(finish_sync_safekeepers),
process_safekeeper_feedback: Some(process_safekeeper_feedback),
log_internal: Some(log_internal),
/* BEGIN_HADRON */
reset_safekeeper_statuses_for_metrics: Some(reset_safekeeper_statuses_for_metrics),
update_safekeeper_status_for_metrics: Some(update_safekeeper_status_for_metrics),
/* END_HADRON */
}
}
@@ -483,8 +451,6 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
replica_promote: false,
min_ps_feedback: empty_feedback,
wal_rate_limiter: empty_wal_rate_limiter,
num_safekeepers: 0,
safekeeper_status: [0; 32],
}
}

View File

@@ -159,21 +159,6 @@ pub trait ApiImpl {
fn after_election(&self, _wp: &mut WalProposer) {
todo!()
}
/* BEGIN_HADRON */
fn reset_safekeeper_statuses_for_metrics(&self, _wp: &mut WalProposer, _num_safekeepers: u32) {
// Do nothing for testing purposes.
}
fn update_safekeeper_status_for_metrics(
&self,
_wp: &mut WalProposer,
_sk_index: u32,
_status: u8,
) {
// Do nothing for testing purposes.
}
/* END_HADRON */
}
#[derive(Debug)]

View File

@@ -54,7 +54,6 @@ pageserver_api.workspace = true
pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
pageserver_compaction.workspace = true
pageserver_page_api.workspace = true
peekable.workspace = true
pem.workspace = true
pin-project-lite.workspace = true
postgres_backend.workspace = true
@@ -67,7 +66,6 @@ postgres-types.workspace = true
posthog_client_lite.workspace = true
pprof.workspace = true
pq_proto.workspace = true
prost.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true

View File

@@ -3,4 +3,3 @@ mod pool;
mod retry;
pub use client::{PageserverClient, ShardSpec};
pub use pageserver_api::shard::ShardStripeSize; // used in ShardSpec

View File

@@ -33,8 +33,6 @@ pub enum ProtocolError {
Invalid(&'static str, String),
#[error("required field '{0}' is missing")]
Missing(&'static str),
#[error("invalid combination of not_modified_lsn '{0}' and request_lsn '{1}'")]
InvalidLsns(Lsn, Lsn),
}
impl ProtocolError {
@@ -87,9 +85,9 @@ impl TryFrom<proto::ReadLsn> for ReadLsn {
return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
}
if pb.not_modified_since_lsn > pb.request_lsn {
return Err(ProtocolError::InvalidLsns(
Lsn(pb.not_modified_since_lsn),
Lsn(pb.request_lsn),
return Err(ProtocolError::invalid(
"not_modified_since_lsn",
pb.not_modified_since_lsn,
));
}
Ok(Self {

View File

@@ -25,9 +25,6 @@ tracing.workspace = true
tokio.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
axum.workspace = true
http.workspace = true
metrics.workspace = true
tonic.workspace = true
url.workspace = true

View File

@@ -34,10 +34,6 @@ use crate::util::{request_stats, tokio_thread_local_stats};
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "false")]
grpc: bool,
#[clap(long, default_value = "false")]
grpc_stream: bool,
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
/// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
@@ -82,9 +78,6 @@ pub(crate) struct Args {
#[clap(long)]
set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
#[clap(long)]
only_relnode: Option<u32>,
/// Queue depth generated in each client.
#[clap(long, default_value = "1")]
queue_depth: NonZeroUsize,
@@ -99,31 +92,10 @@ pub(crate) struct Args {
#[clap(long, default_value = "1")]
batch_size: NonZeroUsize,
#[clap(long)]
only_relnode: Option<u32>,
targets: Option<Vec<TenantTimelineId>>,
#[clap(long, default_value = "100")]
pool_max_consumers: NonZeroUsize,
#[clap(long, default_value = "5")]
pool_error_threshold: NonZeroUsize,
#[clap(long, default_value = "5000")]
pool_connect_timeout: NonZeroUsize,
#[clap(long, default_value = "1000")]
pool_connect_backoff: NonZeroUsize,
#[clap(long, default_value = "60000")]
pool_max_idle_duration: NonZeroUsize,
#[clap(long, default_value = "0")]
max_delay_ms: usize,
#[clap(long, default_value = "0")]
percent_drops: usize,
#[clap(long, default_value = "0")]
percent_hangs: usize,
}
/// State shared by all clients
@@ -180,6 +152,7 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
main_impl(args, thread_local_stats)
})
}
async fn main_impl(
args: Args,
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
@@ -344,7 +317,6 @@ async fn main_impl(
let rps_period = args
.per_client_rate
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
let ss = shared_state.clone();
let cancel = cancel.clone();

View File

@@ -453,7 +453,6 @@ impl TimelineHandles {
handles: Default::default(),
}
}
async fn get(
&mut self,
tenant_id: TenantId,

View File

@@ -5,12 +5,10 @@ MODULE_big = neon
OBJS = \
$(WIN32RES) \
communicator.o \
communicator_new.o \
communicator_process.o \
extension_server.o \
file_cache.o \
hll.o \
lfc_prewarm.o \
libpagestore.o \
logical_replication_monitor.o \
neon.o \
@@ -69,7 +67,6 @@ WALPROP_OBJS = \
# libcommunicator.a is built by cargo from the Rust sources under communicator/
# subdirectory. `cargo build` also generates communicator_bindings.h.
communicator_new.o: communicator/communicator_bindings.h
communicator_process.o: communicator/communicator_bindings.h
file_cache.o: communicator/communicator_bindings.h

View File

@@ -17,36 +17,14 @@ rest_broker = []
[dependencies]
axum.workspace = true
bytes.workspace = true
clashmap.workspace = true
http.workspace = true
libc.workspace = true
nix.workspace = true
atomic_enum = "0.3.0"
measured.workspace = true
prometheus.workspace = true
prost.workspace = true
strum_macros.workspace = true
thiserror.workspace = true
tonic = { workspace = true, default-features = false, features=["codegen", "prost", "transport"] }
tokio = { workspace = true, features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
tokio-pipe = { version = "0.2.12" }
tracing.workspace = true
tracing-subscriber.workspace = true
uring-common = { workspace = true, features = ["bytes"] }
pageserver_client_grpc.workspace = true
pageserver_api.workspace = true
pageserver_page_api.workspace = true
neon-shmem.workspace = true
measured.workspace = true
utils.workspace = true
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
[target.'cfg(target_os = "linux")'.dependencies]
fiemap = "0.1.3"
itertools.workspace = true
[build-dependencies]
cbindgen.workspace = true

View File

@@ -3,18 +3,9 @@
This package provides the so-called "compute-pageserver communicator",
or just "communicator" in short. The communicator is a separate
background worker process that runs in the PostgreSQL server. It's
part of the neon extension.
The commuicator handles the communication with the pageservers, and
also provides an HTTP endpoint for metrics over a local Unix Domain
socket (aka. the "communicator control socket"). On the PostgreSQL
side, the glue code in pgxn/neon/ uses the communicator to implement
the PostgreSQL Storage Manager (SMGR) interface.
## Design criteria
- Low latency
- Saturate a 10 Gbit / s network interface without becoming a bottleneck
part of the neon extension. Currently, it only provides an HTTP
endpoint for metrics, but in the future it will evolve to handle all
communications with the pageservers.
## Source code view
@@ -23,122 +14,10 @@ pgxn/neon/communicator_process.c
the glue that interacts with PostgreSQL code and the Rust
code in the communicator process.
pgxn/neon/communicator_new.c
Contains the backend code that interacts with the communicator
process.
pgxn/neon/communicator/src/backend_interface.rs
The entry point for calls from each backend.
pgxn/neon/communicator/src/init.rs
Initialization at server startup
pgxn/neon/communicator/src/worker_process/
Worker process main loop and glue code
At compilation time, pgxn/neon/communicator/ produces a static
library, libcommunicator.a. It is linked to the neon.so extension
library.
The real networking code, which is independent of PostgreSQL, is in
the pageserver/client_grpc crate.
## Process view
The communicator runs in a dedicated background worker process, the
"communicator process". The communicator uses a multi-threaded Tokio
runtime to execute the IO requests. So the communicator process has
multiple threads running. That's unusual for Postgres processes and
care must be taken to make that work.
### Backend <-> worker communication
Each backend has a number of I/O request slots in shared memory. The
slots are statically allocated for each backend, and must not be
accessed by other backends. The worker process reads requests from the
shared memory slots, and writes responses back to the slots.
Here's an example snapshot of the system, when two requests from two
different backends are in progress:
```
Backends Request slots Communicator process
--------- ------------- --------------------
Backend 1 1: Idle
2: Idle
3: Processing tokio task handling request 3
Backend 2 4: Completed
5: Processing tokio task handling request 5
6: Idle
... ...
```
To submit an IO request, the backend first picks one of its Idle
slots, writes the IO request in the slot, and updates it to
'Submitted' state. That transfers the ownership of the slot to the
worker process, until the worker process marks the request as
Completed. The worker process spawns a separate Tokio task for each
request.
To inform the worker process that a request slot has a pending IO
request, there's a pipe shared by the worker process and all backend
processes. The backend writes the index of the request slot to the
pipe after changing the slot's state to Submitted. This wakes up the
worker process.
(Note that the pipe is just used for wakeups, but the worker process
is free to pick up Submitted IO requests even without receiving the
wakeup. As of this writing, it doesn't do that, but it might be useful
in the future to reduce latency even further, for example.)
When the worker process has completed processing the request, it
writes the result back in the request slot. A GetPage request can also
contain a pointer to buffer in the shared buffer cache. In that case,
the worker process writes the resulting page contents directly to the
buffer, and just a result code in the request slot. It then updates
the 'state' field to Completed, which passes the owner ship back to
the originating backend. Finally, it signals the process Latch of the
originating backend, waking it up.
### Differences between PostgreSQL v16, v17 and v18
PostgreSQL v18 introduced the new AIO mechanism. The PostgreSQL AIO
mechanism uses a very similar mechanism as described in the previous
section, for the communication between AIO worker processes and
backends. With our communicator, the AIO worker processes are not
used, but we use the same PgAioHandle request slots as in upstream.
For Neon-specific IO requests like GetDbSize, a neon request slot is
used. But for the actual IO requests, the request slot merely contains
a pointer to the PgAioHandle slot. The worker process updates the
status of that, calls the IO callbacks upon completionetc, just like
the upstream AIO worker processes do.
## Sequence diagram
neon
PostgreSQL extension backend_interface.rs worker_process.rs processor tonic
| . . . .
| smgr_read() . . . .
+-------------> + . . .
. | . . .
. | rcommunicator_ . . .
. | get_page_at_lsn . . .
. +------------------> + . .
| . .
| write request to . . .
| slot . .
| . .
| . .
| submit_request() . .
+-----------------> + .
| | .
| | db_size_request . .
+---------------->.
. TODO
### Compute <-> pageserver protocol
The protocol between Compute and the pageserver is based on gRPC. See `protos/`.

View File

@@ -1,224 +0,0 @@
//! This module implements a request/response "slot" for submitting
//! requests from backends to the communicator process.
//!
//! NB: The "backend" side of this code runs in Postgres backend processes,
//! which means that it is not safe to use the 'tracing' crate for logging, nor
//! to launch threads or use tokio tasks!
use std::cell::UnsafeCell;
use std::sync::atomic::{AtomicI32, Ordering};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use atomic_enum::atomic_enum;
/// One request/response slot. Each backend has its own set of slots that it
/// uses.
///
/// This is the moral equivalent of PgAioHandle for Postgres AIO requests
/// Like PgAioHandle, try to keep this small.
///
/// There is an array of these in shared memory. Therefore, this must be Sized.
///
/// ## Lifecycle of a request
///
/// A slot is always owned by either the backend process or the communicator
/// process, depending on the 'state'. Only the owning process is allowed to
/// read or modify the slot, except for reading the 'state' itself to check who
/// owns it.
///
/// A slot begins in the Idle state, where it is owned by the backend process.
/// To submit a request, the backend process fills the slot with the request
/// data, and changes it to the Submitted state. After changing the state, the
/// slot is owned by the communicator process, and the backend is not allowed
/// to access it until the communicator process marks it as Completed.
///
/// When the communicator process sees that the slot is in Submitted state, it
/// starts to process the request. After processing the request, it stores the
/// result in the slot, and changes the state to Completed. It is now owned by
/// the backend process again, which may now read the result, and reuse the
/// slot for a new request.
///
/// For correctness of the above protocol, we really only need two states:
/// "owned by backend" and "owned by communicator process". But to help with
/// debugging and better assertions, there are a few more states. When the
/// backend starts to fill in the request details in the slot, it first sets the
/// state from Idle to Filling, and when it's done with that, from Filling to
/// Submitted. In the Filling state, the slot is still owned by the
/// backend. Similarly, when the communicator process starts to process a
/// request, it sets it to Processing state first, but the slot is still owned
/// by the communicator process.
///
/// This struct doesn't handle waking up the communicator process when a request
/// has been submitted or when a response is ready. The 'owner_procno' is used
/// for waking up the backend on completion, but that happens elsewhere.
pub struct NeonIORequestSlot {
/// similar to PgAioHandleState
state: AtomicNeonIORequestSlotState,
/// The owning process's ProcNumber. The worker process uses this to set the
/// process's latch on completion.
///
/// (This could be calculated from num_neon_request_slots_per_backend and
/// the index of this slot in the overall 'neon_requst_slots array'. But we
/// prefer the communicator process to not know how the request slots are
/// divided between the backends.)
owner_procno: AtomicI32,
/// SAFETY: This is modified by submit_request(), after it has established
/// ownership of the slot by setting state from Idle to Filling
request: UnsafeCell<NeonIORequest>,
/// Valid when state is Completed
///
/// SAFETY: This is modified by RequestProcessingGuard::complete(). There
/// can be only one RequestProcessingGuard outstanding for a slot at a time,
/// because it is returned by start_processing_request() which checks the
/// state, so RequestProcessingGuard has exclusive access to the slot.
result: UnsafeCell<NeonIOResult>,
}
// The protocol described in the "Lifecycle of a request" section above ensures
// the safe access to the fields
unsafe impl Send for NeonIORequestSlot {}
unsafe impl Sync for NeonIORequestSlot {}
impl Default for NeonIORequestSlot {
fn default() -> NeonIORequestSlot {
NeonIORequestSlot {
owner_procno: AtomicI32::new(-1),
request: UnsafeCell::new(NeonIORequest::Empty),
result: UnsafeCell::new(NeonIOResult::Empty),
state: AtomicNeonIORequestSlotState::new(NeonIORequestSlotState::Idle),
}
}
}
#[atomic_enum]
#[derive(Eq, PartialEq)]
pub enum NeonIORequestSlotState {
Idle,
/// Backend is filling in the request
Filling,
/// Backend has submitted the request to the communicator, but the
/// communicator process has not yet started processing it.
Submitted,
/// Communicator is processing the request
Processing,
/// Communicator has completed the request, and the 'result' field is now
/// valid, but the backend has not read the result yet.
Completed,
}
impl NeonIORequestSlot {
/// Write a request to the slot, and mark it as Submitted.
///
/// Note: This does not wake up the worker process to actually process
/// the request. It's the caller's responsibility to do that.
pub fn submit_request(&self, request: &NeonIORequest, proc_number: i32) {
// Verify that the slot is in Idle state previously, and put it in
// Filling state.
//
// XXX: This step isn't strictly necessary. Assuming the caller didn't
// screw up and try to use a slot that's already in use, we could fill
// the slot and switch it directly from Idle to Submitted state.
if let Err(s) = self.state.compare_exchange(
NeonIORequestSlotState::Idle,
NeonIORequestSlotState::Filling,
Ordering::Relaxed,
Ordering::Relaxed,
) {
panic!("unexpected state in request slot: {s:?}");
}
// Fill in the request details
self.owner_procno.store(proc_number, Ordering::Relaxed);
unsafe { *self.request.get() = *request }
// This synchronizes-with store/swap in [`start_processing_request`].
// Note that this ensures that the previous non-atomic writes visible
// to other threads too.
self.state
.store(NeonIORequestSlotState::Submitted, Ordering::Release);
}
pub fn get_state(&self) -> NeonIORequestSlotState {
self.state.load(Ordering::Relaxed)
}
pub fn try_get_result(&self) -> Option<NeonIOResult> {
// This synchronizes-with the store/swap in [`RequestProcessingGuard::completed`]
let state = self.state.load(Ordering::Acquire);
if state == NeonIORequestSlotState::Completed {
let result = unsafe { *self.result.get() };
self.state
.store(NeonIORequestSlotState::Idle, Ordering::Relaxed);
Some(result)
} else {
None
}
}
/// Read the IO request from the slot indicated in the wakeup
pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
// XXX: using atomic load rather than compare_exchange would be
// sufficient here, as long as the communicator process has _some_ means
// of tracking which requests it's already processing. That could be a
// flag somewhere in communicator's private memory, for example.
//
// This synchronizes-with the store in [`submit_request`].
if let Err(s) = self.state.compare_exchange(
NeonIORequestSlotState::Submitted,
NeonIORequestSlotState::Processing,
Ordering::Acquire,
Ordering::Relaxed,
) {
// FIXME surprising state. This is unexpected at the moment, but if we
// started to process requests more aggressively, without waiting for the
// read from the pipe, then this could happen
panic!("unexpected state in request slot: {s:?}");
}
Some(RequestProcessingGuard(self))
}
}
/// [`NeonIORequestSlot::start_processing_request`] returns this guard object to
/// indicate that the the caller now "owns" the slot, until it calls
/// [`RequestProcessingGuard::completed`].
///
/// TODO: implement Drop on this, to mark the request as Aborted or Errored
/// if [`RequestProcessingGuard::completed`] is not called.
pub struct RequestProcessingGuard<'a>(&'a NeonIORequestSlot);
unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
impl<'a> RequestProcessingGuard<'a> {
pub fn get_request(&self) -> &NeonIORequest {
unsafe { &*self.0.request.get() }
}
pub fn get_owner_procno(&self) -> i32 {
self.0.owner_procno.load(Ordering::Relaxed)
}
pub fn completed(self, result: NeonIOResult) {
// Store the result to the slot.
unsafe {
*self.0.result.get() = result;
};
// Mark the request as completed. After that, we no longer have
// ownership of the slot, and must not modify it.
let old_state = self
.0
.state
.swap(NeonIORequestSlotState::Completed, Ordering::Release);
assert!(old_state == NeonIORequestSlotState::Processing);
}
}

View File

@@ -1,296 +0,0 @@
//! This code runs in each backend process. That means that launching Rust threads, panicking
//! etc. is forbidden!
use std::os::fd::OwnedFd;
use crate::backend_comms::NeonIORequestSlot;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
use crate::neon_request::{CCachedGetPageVResult, CLsn, COid};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use utils::lsn::Lsn;
pub struct CommunicatorBackendStruct<'t> {
my_proc_number: i32,
neon_request_slots: &'t [NeonIORequestSlot],
submission_pipe_write_fd: OwnedFd,
pending_cache_read_op: Option<BackendCacheReadOp<'t>>,
integrated_cache: &'t IntegratedCacheReadAccess<'t>,
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_backend_init(
cis: Box<CommunicatorInitStruct>,
my_proc_number: i32,
) -> &'static mut CommunicatorBackendStruct<'static> {
if my_proc_number < 0 {
panic!("cannot attach to communicator shared memory with procnumber {my_proc_number}");
}
let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init()));
let bs: &'static mut CommunicatorBackendStruct =
Box::leak(Box::new(CommunicatorBackendStruct {
my_proc_number,
neon_request_slots: cis.neon_request_slots,
submission_pipe_write_fd: cis.submission_pipe_write_fd,
pending_cache_read_op: None,
integrated_cache,
}));
bs
}
/// Start a request. You can poll for its completion and get the result by
/// calling bcomm_poll_dbsize_request_completion(). The communicator will wake
/// us up by setting our process latch, so to wait for the completion, wait on
/// the latch and call bcomm_poll_dbsize_request_completion() every time the
/// latch is set.
///
/// Safety: The C caller must ensure that the references are valid.
/// The requested slot must be free, or this panics.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_io_request(
bs: &'_ mut CommunicatorBackendStruct,
slot_idx: i32,
request: &NeonIORequest,
immediate_result_ptr: &mut NeonIOResult,
) -> i32 {
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
if let NeonIORequest::RelSize(req) = request {
if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) {
*immediate_result_ptr = NeonIOResult::RelSize(nblocks);
return -1;
}
}
// Create neon request and submit it
bs.start_neon_io_request(slot_idx, request);
slot_idx
}
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_get_page_v_request(
bs: &mut CommunicatorBackendStruct,
slot_idx: i32,
request: &NeonIORequest,
immediate_result_ptr: &mut CCachedGetPageVResult,
) -> i32 {
let NeonIORequest::GetPageV(get_pagev_request) = request else {
panic!("invalid request passed to bcomm_start_get_page_v_request()");
};
assert!(matches!(request, NeonIORequest::GetPageV(_)));
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
let mut all_cached = true;
let mut read_op = bs.integrated_cache.start_read_op();
for i in 0..get_pagev_request.nblocks {
if let Some(cache_block) = read_op.get_page(
&get_pagev_request.reltag(),
get_pagev_request.block_number + i as u32,
) {
immediate_result_ptr.cache_block_numbers[i as usize] = cache_block;
} else {
// not found in cache
all_cached = false;
break;
}
}
if all_cached {
bs.pending_cache_read_op = Some(read_op);
return -1;
}
// Create neon request and submit it
bs.start_neon_io_request(slot_idx, request);
slot_idx
}
/// Check if a request has completed. Returns:
///
/// -1 if the request is still being processed
/// 0 on success
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_poll_request_completion(
bs: &mut CommunicatorBackendStruct,
request_slot_idx: u32,
result_p: &mut NeonIOResult,
) -> i32 {
match bs.neon_request_slots[request_slot_idx as usize].try_get_result() {
None => -1, // still processing
Some(result) => {
*result_p = result;
0
}
}
}
/// Check if a request has completed. Returns:
///
/// 'false' if the slot is Idle. The backend process has ownership.
/// 'true' if the slot is busy, and should be polled for result.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_get_request_slot_status(
bs: &mut CommunicatorBackendStruct,
request_slot_idx: u32,
) -> bool {
use crate::backend_comms::NeonIORequestSlotState;
match bs.neon_request_slots[request_slot_idx as usize].get_state() {
NeonIORequestSlotState::Idle => false,
NeonIORequestSlotState::Filling => {
// 'false' would be the right result here. However, this
// is a very transient state. The C code should never
// leave a slot in this state, so if it sees that,
// something's gone wrong and it's not clear what to do
// with it.
panic!("unexpected Filling state in request slot {request_slot_idx}");
}
NeonIORequestSlotState::Submitted => true,
NeonIORequestSlotState::Processing => true,
NeonIORequestSlotState::Completed => true,
}
}
// LFC functions
/// Finish a local file cache read
///
//
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool {
if let Some(op) = bs.pending_cache_read_op.take() {
op.finish()
} else {
panic!("bcomm_finish_cache_read() called with no cached read pending");
}
}
/// Check if LFC contains the given buffer, and update its last-written LSN if not.
///
/// This is used in WAL replay in read replica, to skip updating pages that are
/// not in cache.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_update_lw_lsn_for_block_if_not_cached(
bs: &mut CommunicatorBackendStruct,
spc_oid: COid,
db_oid: COid,
rel_number: u32,
fork_number: u8,
block_number: u32,
lsn: CLsn,
) -> bool {
bs.integrated_cache.update_lw_lsn_for_block_if_not_cached(
&pageserver_page_api::RelTag {
spcnode: spc_oid,
dbnode: db_oid,
relnode: rel_number,
forknum: fork_number,
},
block_number,
Lsn(lsn),
)
}
#[repr(C)]
#[derive(Clone, Debug)]
pub struct FileCacheIterator {
next_bucket: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
}
/// Iterate over LFC contents
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_iterate_begin(
_bs: &mut CommunicatorBackendStruct,
iter: *mut FileCacheIterator,
) {
unsafe { (*iter).next_bucket = 0 };
}
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_iterate_next(
bs: &mut CommunicatorBackendStruct,
iter: *mut FileCacheIterator,
) -> bool {
use crate::integrated_cache::GetBucketResult;
loop {
let next_bucket = unsafe { (*iter).next_bucket } as usize;
match bs.integrated_cache.get_bucket(next_bucket) {
GetBucketResult::Occupied(rel, blk) => {
unsafe {
(*iter).spc_oid = rel.spcnode;
(*iter).db_oid = rel.dbnode;
(*iter).rel_number = rel.relnode;
(*iter).fork_number = rel.forknum;
(*iter).block_number = blk;
(*iter).next_bucket += 1;
}
break true;
}
GetBucketResult::Vacant => {
unsafe {
(*iter).next_bucket += 1;
}
continue;
}
GetBucketResult::OutOfBounds => {
break false;
}
}
}
}
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_get_num_pages_used(bs: &mut CommunicatorBackendStruct) -> u64 {
bs.integrated_cache.get_num_buckets_in_use() as u64
}
impl<'t> CommunicatorBackendStruct<'t> {
/// The slot must be free, or this panics.
pub(crate) fn start_neon_io_request(&mut self, request_slot_idx: i32, request: &NeonIORequest) {
let my_proc_number = self.my_proc_number;
self.neon_request_slots[request_slot_idx as usize].submit_request(request, my_proc_number);
// Tell the communicator about it
self.notify_about_request(request_slot_idx);
}
/// Send a wakeup to the communicator process
fn notify_about_request(self: &CommunicatorBackendStruct<'t>, request_slot_idx: i32) {
// wake up communicator by writing the idx to the submission pipe
//
// This can block, if the pipe is full. That should be very rare,
// because the communicator tries hard to drain the pipe to prevent
// that. Also, there's a natural upper bound on how many wakeups can be
// queued up: there is only a limited number of request slots for each
// backend.
//
// If it does block very briefly, that's not too serious.
let idxbuf = request_slot_idx.to_ne_bytes();
let _res = nix::unistd::write(&self.submission_pipe_write_fd, &idxbuf);
// FIXME: check result, return any errors
}
}

View File

@@ -1,299 +0,0 @@
//! Implement the "low-level" parts of the file cache.
//!
//! This module just deals with reading and writing the file, and keeping track
//! which blocks in the cache file are in use and which are free. The "high
//! level" parts of tracking which block in the cache file corresponds to which
//! relation block is handled in 'integrated_cache' instead.
//!
//! This module is only used to access the file from the communicator
//! process. The backend processes *also* read the file (and sometimes also
//! write it? ), but the backends use direct C library calls for that.
use std::fs::File;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::atomic::{AtomicU64, Ordering};
use measured::metric;
use measured::metric::MetricEncoding;
use measured::metric::gauge::GaugeState;
use measured::{Gauge, MetricGroup};
use crate::BLCKSZ;
use tokio::task::spawn_blocking;
pub type CacheBlock = u64;
pub const INVALID_CACHE_BLOCK: CacheBlock = u64::MAX;
pub struct FileCache {
file: Arc<File>,
free_list: Mutex<FreeList>,
/// The true number of writable blocks within the LFC file.
///
/// The `max_blocks` field of `FreeList` isn't sufficient once holes are
/// punched, so we track this manually.
pub size: AtomicU64,
/// The path to the LFC file.
///
/// The `fiemap-rs` library doesn't expose any way to issue a FIEMAP ioctl
/// on an existing file descriptor, so we have to save the path.
path: PathBuf,
metrics: FileCacheMetricGroup,
}
#[derive(MetricGroup)]
#[metric(new())]
struct FileCacheMetricGroup {
/// Local File Cache size in 8KiB blocks
max_blocks: Gauge,
/// Number of free 8KiB blocks in Local File Cache
num_free_blocks: Gauge,
}
// TODO: We keep track of all free blocks in this vec. That doesn't really scale.
// Idea: when free_blocks fills up with more than 1024 entries, write them all to
// one block on disk.
#[derive(Debug)]
struct FreeList {
next_free_block: CacheBlock,
max_blocks: u64,
free_blocks: Vec<CacheBlock>,
}
impl FileCache {
pub fn new(file_cache_path: &Path, mut initial_size: u64) -> Result<FileCache, std::io::Error> {
if initial_size < 100 {
tracing::warn!(
"min size for file cache is 100 blocks, {} requested",
initial_size
);
initial_size = 100;
}
let file = std::fs::OpenOptions::new()
.read(true)
.write(true)
.truncate(true)
.create(true)
.open(file_cache_path)?;
Ok(FileCache {
file: Arc::new(file),
size: initial_size.into(),
free_list: Mutex::new(FreeList {
next_free_block: 0,
max_blocks: initial_size,
free_blocks: Vec::new(),
}),
path: file_cache_path.to_path_buf(),
metrics: FileCacheMetricGroup::new(),
})
}
pub async fn read_block(
&self,
cache_block: CacheBlock,
mut dst: impl uring_common::buf::IoBufMut + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(dst.bytes_total() == BLCKSZ);
let file = self.file.clone();
let dst_ref = unsafe { std::slice::from_raw_parts_mut(dst.stable_mut_ptr(), BLCKSZ) };
spawn_blocking(move || file.read_exact_at(dst_ref, cache_block * BLCKSZ as u64)).await??;
Ok(())
}
pub async fn write_block(
&self,
cache_block: CacheBlock,
src: impl uring_common::buf::IoBuf + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(src.bytes_init() == BLCKSZ);
let file = self.file.clone();
let src_ref = unsafe { std::slice::from_raw_parts(src.stable_ptr(), BLCKSZ) };
spawn_blocking(move || file.write_all_at(src_ref, cache_block * BLCKSZ as u64)).await??;
Ok(())
}
/// Allocate a block within the LFC file for use by the LFC.
pub fn alloc_block(&self) -> Option<CacheBlock> {
let mut free_list = self.free_list.lock().unwrap();
if let Some(x) = free_list.free_blocks.pop() {
return Some(x);
}
if free_list.next_free_block < free_list.max_blocks {
let result = free_list.next_free_block;
free_list.next_free_block += 1;
return Some(result);
}
None
}
/// Mark a block used by the LFC as free for allocation.
pub fn dealloc_block(&self, cache_block: CacheBlock) {
let mut free_list = self.free_list.lock().unwrap();
free_list.free_blocks.push(cache_block);
}
/// Attempt to let the filesystem reclaim `num_blocks` of free blocks within the LFC.
/// Returns the number of blocks not reclaimed by the filesystem.
pub fn reclaim_blocks(&self, num_blocks: u64) -> u64 {
let mut free_list = self.free_list.lock().unwrap();
// Try to limit the maximum first so that we can shrink without doing any I/O
let unused_space = num_blocks.min(free_list.max_blocks - free_list.next_free_block);
free_list.max_blocks -= unused_space;
self.size.fetch_sub(unused_space, Ordering::Relaxed);
let punched = (free_list.free_blocks.len() as u64).min(num_blocks - unused_space);
for _ in 0..punched {
self.punch_block(free_list.free_blocks.pop().unwrap());
}
num_blocks - unused_space - punched
}
// "Delete" a block via fallocate's hole punching feature.
// TODO(quantumish): possibly implement some batching? lots of syscalls...
// unfortunately should be at odds with our access pattern as entries in the hashmap
// should have no correlation with the location of blocks in the actual LFC file.
/// "Un-punch" a block by re-allocating it with `fallocate` and update LFC size.
fn unpunch_block(&self, block: CacheBlock) {
use nix::fcntl as nix;
self.size.fetch_add(1, Ordering::Relaxed);
if let Err(e) = nix::fallocate(
self.file.clone(),
nix::FallocateFlags::FALLOC_FL_ZERO_RANGE
.union(nix::FallocateFlags::FALLOC_FL_KEEP_SIZE),
(block as usize * BLCKSZ) as libc::off_t,
BLCKSZ as libc::off_t
) {
tracing::error!("failed to un-punch hole in LFC at {block}: {e}");
return;
}
}
/// "Punch" a block out of the LFC file by using `fallocate` and update LFC size.
pub fn punch_block(&self, block: CacheBlock) {
use nix::fcntl as nix;
self.size.fetch_sub(1, Ordering::Relaxed);
if let Err(e) = nix::fallocate(
self.file.clone(),
nix::FallocateFlags::FALLOC_FL_PUNCH_HOLE
.union(nix::FallocateFlags::FALLOC_FL_KEEP_SIZE),
(block as usize * BLCKSZ) as libc::off_t,
BLCKSZ as libc::off_t
) {
tracing::error!("failed to punch hole in LFC at {block}: {e}");
return;
}
}
/// Attempt to unpunch `num_blocks` of previously hole-punched blocks.
///
/// Similarly named to `unpunch_block` but does not punch a series of blocks in a row.
/// Instead uses FIEMAP ioctl to locate holes in the file and unpunch them!
#[cfg(target_os = "linux")]
pub fn unpunch_blocks(&self, num_blocks: u64) -> u64 {
use itertools::Itertools;
let mut pushed = 0;
let mut free_list = self.free_list.lock().unwrap();
let res = fiemap::fiemap(self.path.as_path()).unwrap();
for (prev, cur) in res.map(|x| x.unwrap()).tuple_windows() {
if (prev.fe_logical + prev.fe_length) < cur.fe_logical {
let mut end = prev.fe_logical + prev.fe_length;
while end < cur.fe_logical {
free_list.free_blocks.push(end / BLCKSZ as u64);
self.unpunch_block(end / BLCKSZ as u64);
pushed += 1;
if pushed == num_blocks {
return 0;
}
end += BLCKSZ as u64;
}
}
}
num_blocks - pushed
}
/// Attempt to unpunch `num_blocks` of previously hole-punched blocks.
///
/// Much more expensive than the Linux variant as each hole must be located
/// by a separate call to `lseek`.
///
/// FIXME: Sometimes this function provides inaccurate counts of the number
/// of holes within a file. Whether we need this function at all is unclear,
/// as seemingly this part of the codebase only targets a system with ext4?
#[cfg(target_os = "macos")]
pub fn unpunch_blocks(&self, num_blocks: u64) -> u64 {
use nix::unistd as nix;
let mut free_list = self.free_list.lock().unwrap();
let num_bytes = (free_list.next_free_block * BLOCKSZ) as i64;
let mut cur_pos = 0;
let mut pushed = 0;
while cur_pos < num_bytes {
let res = nix::lseek(
file.clone(),
cur_pos,
nix::Whence::SeekHole
).unwrap();
if res >= num_bytes {
break;
}
self.unpunch_block(res);
free_list.free_blocks.push(res);
pushed += 1;
if pushed == num_blocks {
return 0;
}
cur_pos = res + BLOCKSZ as i64;
}
num_blocks - pushed
}
/// Physically grows the file and expands the freelist.
pub fn grow(&self, num_blocks: u64) {
let mut free_list = self.free_list.lock().unwrap();
self.size.fetch_add(num_blocks, Ordering::Relaxed);
free_list.max_blocks += num_blocks;
}
/// Returns number of blocks in the remaining space.
pub fn free_space(&self) -> u64 {
let free_list = self.free_list.lock().unwrap();
let slab = free_list.max_blocks - free_list.next_free_block.min(free_list.max_blocks);
let fragments = free_list.free_blocks.len() as u64;
slab + fragments
}
}
impl<T: metric::group::Encoding> MetricGroup<T> for FileCache
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), <T as metric::group::Encoding>::Err> {
// Update the gauges with fresh values first
{
let free_list = self.free_list.lock().unwrap();
self.metrics.max_blocks.set(free_list.max_blocks as i64);
let total_free_blocks: i64 = free_list.free_blocks.len() as i64
+ (free_list.max_blocks as i64 - free_list.next_free_block as i64);
self.metrics.num_free_blocks.set(total_free_blocks);
}
self.metrics.collect_group_into(enc)
}
}

View File

@@ -1,107 +0,0 @@
//! Global allocator, for tracking memory usage of the Rust parts
//!
//! Postgres is designed to handle allocation failure (ie. malloc() returning NULL) gracefully. It
//! rolls backs the transaction and gives the user an "ERROR: out of memory" error. Rust code
//! however panics if an allocation fails. We don't want that to ever happen, because an unhandled
//! panic leads to Postgres crash and restart. Our strategy is to pre-allocate a large enough chunk
//! of memory for use by the Rust code, so that the allocations never fail.
//!
//! To pick the size for the pre-allocated chunk, we have a metric to track the high watermark
//! memory usage of all the Rust allocations in total.
//!
//! TODO:
//!
//! - Currently we just export the metrics. Actual allocations are still just passed through to
//! the system allocator.
//! - Take padding etc. overhead into account
use std::alloc::{GlobalAlloc, Layout, System};
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use measured::metric;
use measured::metric::MetricEncoding;
use measured::metric::gauge::GaugeState;
use measured::{Gauge, MetricGroup};
pub(crate) struct MyAllocator {
allocations: AtomicU64,
deallocations: AtomicU64,
allocated: AtomicUsize,
high: AtomicUsize,
}
#[derive(MetricGroup)]
#[metric(new())]
struct MyAllocatorMetricGroup {
/// Number of allocations in Rust code
communicator_mem_allocations: Gauge,
/// Number of deallocations in Rust code
communicator_mem_deallocations: Gauge,
/// Bytes currently allocated
communicator_mem_allocated: Gauge,
/// High watermark of allocated bytes
communicator_mem_high: Gauge,
}
unsafe impl GlobalAlloc for MyAllocator {
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
self.allocations.fetch_add(1, Ordering::Relaxed);
let mut allocated = self.allocated.fetch_add(layout.size(), Ordering::Relaxed);
allocated += layout.size();
self.high.fetch_max(allocated, Ordering::Relaxed);
unsafe { System.alloc(layout) }
}
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
self.deallocations.fetch_add(1, Ordering::Relaxed);
self.allocated.fetch_sub(layout.size(), Ordering::Relaxed);
unsafe { System.dealloc(ptr, layout) }
}
}
#[global_allocator]
static GLOBAL: MyAllocator = MyAllocator {
allocations: AtomicU64::new(0),
deallocations: AtomicU64::new(0),
allocated: AtomicUsize::new(0),
high: AtomicUsize::new(0),
};
pub(crate) struct MyAllocatorCollector {
metrics: MyAllocatorMetricGroup,
}
impl MyAllocatorCollector {
pub(crate) fn new() -> Self {
Self {
metrics: MyAllocatorMetricGroup::new(),
}
}
}
impl<T: metric::group::Encoding> MetricGroup<T> for MyAllocatorCollector
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), <T as metric::group::Encoding>::Err> {
// Update the gauges with fresh values first
self.metrics
.communicator_mem_allocations
.set(GLOBAL.allocations.load(Ordering::Relaxed) as i64);
self.metrics
.communicator_mem_deallocations
.set(GLOBAL.allocations.load(Ordering::Relaxed) as i64);
self.metrics
.communicator_mem_allocated
.set(GLOBAL.allocated.load(Ordering::Relaxed) as i64);
self.metrics
.communicator_mem_high
.set(GLOBAL.high.load(Ordering::Relaxed) as i64);
self.metrics.collect_group_into(enc)
}
}

View File

@@ -1,166 +0,0 @@
//! Initialization functions. These are executed in the postmaster process,
//! at different stages of server startup.
//!
//!
//! Communicator initialization steps:
//!
//! 1. At postmaster startup, before shared memory is allocated,
//! rcommunicator_shmem_size() is called to get the amount of
//! shared memory that this module needs.
//!
//! 2. Later, after the shared memory has been allocated,
//! rcommunicator_shmem_init() is called to initialize the shmem
//! area.
//!
//! Per process initialization:
//!
//! When a backend process starts up, it calls rcommunicator_backend_init().
//! In the communicator worker process, other functions are called, see
//! `worker_process` module.
use std::ffi::c_int;
use std::mem;
use std::mem::MaybeUninit;
use std::os::fd::OwnedFd;
use crate::backend_comms::NeonIORequestSlot;
use crate::integrated_cache::IntegratedCacheInitStruct;
/// This struct is created in the postmaster process, and inherited to
/// the communicator process and all backend processes through fork()
#[repr(C)]
pub struct CommunicatorInitStruct {
pub submission_pipe_read_fd: OwnedFd,
pub submission_pipe_write_fd: OwnedFd,
// Shared memory data structures
pub num_neon_request_slots: u32,
pub neon_request_slots: &'static [NeonIORequestSlot],
pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>,
}
impl std::fmt::Debug for CommunicatorInitStruct {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
fmt.debug_struct("CommunicatorInitStruct")
.field("submission_pipe_read_fd", &self.submission_pipe_read_fd)
.field("submission_pipe_write_fd", &self.submission_pipe_write_fd)
.field("num_neon_request_slots", &self.num_neon_request_slots)
.field("neon_request_slots length", &self.neon_request_slots.len())
.finish()
}
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_size(num_neon_request_slots: u32) -> u64 {
let mut size = 0;
size += mem::size_of::<NeonIORequestSlot>() * num_neon_request_slots as usize;
// For integrated_cache's Allocator. TODO: make this adjustable
size += IntegratedCacheInitStruct::shmem_size();
size as u64
}
/// Initialize the shared memory segment. Returns a backend-private
/// struct, which will be inherited by backend processes through fork
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_init(
submission_pipe_read_fd: c_int,
submission_pipe_write_fd: c_int,
num_neon_request_slots: u32,
shmem_area_ptr: *mut MaybeUninit<u8>,
shmem_area_len: u64,
initial_file_cache_size: u64,
max_file_cache_size: u64,
) -> &'static mut CommunicatorInitStruct {
let shmem_area: &'static mut [MaybeUninit<u8>] =
unsafe { std::slice::from_raw_parts_mut(shmem_area_ptr, shmem_area_len as usize) };
let (neon_request_slots, remaining_area) =
alloc_array_from_slice::<NeonIORequestSlot>(shmem_area, num_neon_request_slots as usize);
for slot in neon_request_slots.iter_mut() {
slot.write(NeonIORequestSlot::default());
}
// 'neon_request_slots' is initialized now. (MaybeUninit::slice_assume_init_mut() is nightly-only
// as of this writing.)
let neon_request_slots = unsafe {
std::mem::transmute::<&mut [MaybeUninit<NeonIORequestSlot>], &mut [NeonIORequestSlot]>(
neon_request_slots,
)
};
// Give the rest of the area to the integrated cache
let integrated_cache_init_struct = IntegratedCacheInitStruct::shmem_init(
remaining_area,
initial_file_cache_size,
max_file_cache_size,
);
let (submission_pipe_read_fd, submission_pipe_write_fd) = unsafe {
use std::os::fd::FromRawFd;
(
OwnedFd::from_raw_fd(submission_pipe_read_fd),
OwnedFd::from_raw_fd(submission_pipe_write_fd),
)
};
let cis: &'static mut CommunicatorInitStruct = Box::leak(Box::new(CommunicatorInitStruct {
submission_pipe_read_fd,
submission_pipe_write_fd,
num_neon_request_slots,
neon_request_slots,
integrated_cache_init_struct,
}));
cis
}
pub fn alloc_from_slice<T>(
area: &mut [MaybeUninit<u8>],
) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
let layout = std::alloc::Layout::new::<T>();
let area_start = area.as_mut_ptr();
// pad to satisfy alignment requirements
let padding = area_start.align_offset(layout.align());
if padding + layout.size() > area.len() {
panic!("out of memory");
}
let area = &mut area[padding..];
let (result_area, remain) = area.split_at_mut(layout.size());
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
let result = unsafe { result_ptr.as_mut().unwrap() };
(result, remain)
}
pub fn alloc_array_from_slice<T>(
area: &mut [MaybeUninit<u8>],
len: usize,
) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
let layout = std::alloc::Layout::new::<T>();
let area_start = area.as_mut_ptr();
// pad to satisfy alignment requirements
let padding = area_start.align_offset(layout.align());
if padding + layout.size() * len > area.len() {
panic!("out of memory");
}
let area = &mut area[padding..];
let (result_area, remain) = area.split_at_mut(layout.size() * len);
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
let result = unsafe { std::slice::from_raw_parts_mut(result_ptr.as_mut().unwrap(), len) };
(result, remain)
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,29 +1,5 @@
//! Three main parts:
//! - async tokio communicator core, which receives requests and processes them.
//! - Main loop and requests queues, which routes requests from backends to the core
//! - the per-backend glue code, which submits requests
mod backend_comms;
// mark this 'pub', because these functions are called from C code. Otherwise, the compiler
// complains about a bunch of structs and enum variants being unused, because it thinkgs
// the functions that use them are never called. There are some C-callable functions in
// other modules too, but marking this as pub is currently enough to silence the warnings
//
// TODO: perhaps collect *all* the extern "C" functions to one module?
pub mod backend_interface;
mod file_cache;
mod init;
mod integrated_cache;
mod neon_request;
mod worker_process;
mod global_allocator;
/// Name of the Unix Domain Socket that serves the metrics, and other APIs in the
/// future. This is within the Postgres data directory.
const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket";
// FIXME: get this from postgres headers somehow
pub const BLCKSZ: usize = 8192;

View File

@@ -1,466 +0,0 @@
// Definitions of some core PostgreSQL datatypes.
/// XLogRecPtr is defined in "access/xlogdefs.h" as:
///
/// ```
/// typedef uint64 XLogRecPtr;
/// ```
/// cbindgen:no-export
pub type XLogRecPtr = u64;
pub type CLsn = XLogRecPtr;
pub type COid = u32;
// This conveniently matches PG_IOV_MAX
pub const MAX_GETPAGEV_PAGES: usize = 32;
pub const INVALID_BLOCK_NUMBER: u32 = u32::MAX;
use std::ffi::CStr;
use pageserver_page_api::{self as page_api, SlruKind};
/// Request from a Postgres backend to the communicator process
#[allow(clippy::large_enum_variant)]
#[repr(C)]
#[derive(Copy, Clone, Debug, strum_macros::EnumDiscriminants)]
#[strum_discriminants(derive(measured::FixedCardinalityLabel))]
pub enum NeonIORequest {
Empty,
// Read requests. These are C-friendly variants of the corresponding structs in
// pageserver_page_api.
RelSize(CRelSizeRequest),
GetPageV(CGetPageVRequest),
ReadSlruSegment(CReadSlruSegmentRequest),
PrefetchV(CPrefetchVRequest),
DbSize(CDbSizeRequest),
/// This is like GetPageV, but bypasses the LFC and allows specifiying the
/// request LSNs directly. For debugging purposes only.
GetPageVUncached(CGetPageVUncachedRequest),
// Write requests. These are needed to keep the relation size cache and LFC up-to-date.
// They are not sent to the pageserver.
WritePage(CWritePageRequest),
RelExtend(CRelExtendRequest),
RelZeroExtend(CRelZeroExtendRequest),
RelCreate(CRelCreateRequest),
RelTruncate(CRelTruncateRequest),
RelUnlink(CRelUnlinkRequest),
// Other requests
UpdateCachedRelSize(CUpdateCachedRelSizeRequest),
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub enum NeonIOResult {
Empty,
/// InvalidBlockNumber == 0xffffffff means "rel does not exist"
RelSize(u32),
/// the result pages are written to the shared memory addresses given in the request
GetPageV,
/// The result is written to the file, path to which is provided
/// in the request. The [`u64`] value here is the number of blocks.
ReadSlruSegment(u64),
/// A prefetch request returns as soon as the request has been received by the communicator.
/// It is processed in the background.
PrefetchVLaunched,
DbSize(u64),
// FIXME design compact error codes. Can't easily pass a string or other dynamic data.
// currently, this is 'errno'
Error(i32),
Aborted,
/// used for all write requests
WriteOK,
}
impl NeonIORequest {
/// All requests include a unique request ID, which can be used to trace the execution
/// of a request all the way to the pageservers. The request ID needs to be unique
/// within the lifetime of the Postgres instance (but not across servers or across
/// restarts of the same server).
pub fn request_id(&self) -> u64 {
use NeonIORequest::*;
match self {
Empty => 0,
RelSize(req) => req.request_id,
GetPageV(req) => req.request_id,
GetPageVUncached(req) => req.request_id,
ReadSlruSegment(req) => req.request_id,
PrefetchV(req) => req.request_id,
DbSize(req) => req.request_id,
WritePage(req) => req.request_id,
RelExtend(req) => req.request_id,
RelZeroExtend(req) => req.request_id,
RelCreate(req) => req.request_id,
RelTruncate(req) => req.request_id,
RelUnlink(req) => req.request_id,
UpdateCachedRelSize(req) => req.request_id,
}
}
}
/// Special quick result to a CGetPageVRequest request, indicating that the
/// the requested pages are present in the local file cache. The backend can
/// read the blocks directly from the given LFC blocks.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CCachedGetPageVResult {
pub cache_block_numbers: [u64; MAX_GETPAGEV_PAGES],
}
/// ShmemBuf represents a buffer in shared memory.
///
/// SAFETY: The pointer must point to an area in shared memory. The functions allow you to liberally
/// get a mutable pointer to the contents; it is the caller's responsibility to ensure that you
/// don't access a buffer that's you're not allowed to. Inappropriate access to the buffer doesn't
/// violate Rust's safety semantics, but it will mess up and crash Postgres.
///
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct ShmemBuf {
// Pointer to where the result is written or where to read from. Must point into a buffer in shared memory!
pub ptr: *mut u8,
}
unsafe impl Send for ShmemBuf {}
unsafe impl Sync for ShmemBuf {}
unsafe impl uring_common::buf::IoBuf for ShmemBuf {
fn stable_ptr(&self) -> *const u8 {
self.ptr
}
fn bytes_init(&self) -> usize {
crate::BLCKSZ
}
fn bytes_total(&self) -> usize {
crate::BLCKSZ
}
}
unsafe impl uring_common::buf::IoBufMut for ShmemBuf {
fn stable_mut_ptr(&mut self) -> *mut u8 {
self.ptr
}
unsafe fn set_init(&mut self, pos: usize) {
if pos > crate::BLCKSZ {
panic!(
"set_init called past end of buffer, pos {}, buffer size {}",
pos,
crate::BLCKSZ
);
}
}
}
impl ShmemBuf {
pub fn as_mut_ptr(&self) -> *mut u8 {
self.ptr
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelSizeRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub allow_missing: bool,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CGetPageVRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CGetPageVUncachedRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
pub request_lsn: CLsn,
pub not_modified_since: CLsn,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CReadSlruSegmentRequest {
pub request_id: u64,
pub slru_kind: SlruKind,
pub segment_number: u32,
pub request_lsn: CLsn,
/// Must be a null-terminated C string containing the file path
/// where the communicator will write the SLRU segment.
pub destination_file_path: ShmemBuf,
}
impl CReadSlruSegmentRequest {
/// Returns the file path where the communicator will write the
/// SLRU segment.
pub(crate) fn destination_file_path(&self) -> String {
unsafe { CStr::from_ptr(self.destination_file_path.as_mut_ptr() as *const _) }
.to_string_lossy()
.into_owned()
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CPrefetchVRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CDbSizeRequest {
pub request_id: u64,
pub db_oid: COid,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CWritePageRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// `src` defines the new page contents. Must point into a buffer in shared memory!
pub src: ShmemBuf,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelExtendRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// `src` defines the new page contents. Must point into a buffer in shared memory!
pub src: ShmemBuf,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelZeroExtendRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u32,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelCreateRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelTruncateRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub nblocks: u32,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelUnlinkRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub lsn: CLsn,
}
impl CRelSizeRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CGetPageVRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CGetPageVUncachedRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CPrefetchVRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CWritePageRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelExtendRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelZeroExtendRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelCreateRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelTruncateRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelUnlinkRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CUpdateCachedRelSizeRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub nblocks: u32,
pub lsn: CLsn,
}
impl CUpdateCachedRelSizeRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}

View File

@@ -4,13 +4,10 @@
//!
//! These are called from the communicator threads! Careful what you do, most Postgres
//! functions are not safe to call in that context.
use utils::lsn::Lsn;
#[cfg(not(test))]
unsafe extern "C" {
pub fn notify_proc_unsafe(procno: std::ffi::c_int);
pub fn callback_set_my_latch_unsafe();
pub fn callback_get_request_lsn_unsafe() -> crate::neon_request::CLsn;
pub fn callback_get_lfc_metrics_unsafe() -> LfcMetrics;
}
@@ -19,36 +16,20 @@ unsafe extern "C" {
// package, but the code coverage build still builds these and tries to link with the
// external C code.)
#[cfg(test)]
unsafe fn notify_proc_unsafe(_procno: std::ffi::c_int) {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_set_my_latch_unsafe() {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_get_request_lsn_unsafe() -> crate::neon_request::CLsn {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_get_lfc_metrics_unsafe() -> LfcMetrics {
panic!("not usable in unit tests");
}
// safe wrappers
pub(super) fn notify_proc(procno: std::ffi::c_int) {
unsafe { notify_proc_unsafe(procno) };
}
pub(super) fn callback_set_my_latch() {
unsafe { callback_set_my_latch_unsafe() };
}
pub(super) fn get_request_lsn() -> Lsn {
Lsn(unsafe { callback_get_request_lsn_unsafe() })
}
pub(super) fn callback_get_lfc_metrics() -> LfcMetrics {
unsafe { callback_get_lfc_metrics_unsafe() }
}

View File

@@ -19,105 +19,71 @@ use http::StatusCode;
use http::header::CONTENT_TYPE;
use measured::MetricGroup;
use measured::metric::MetricEncoding;
use measured::metric::gauge::GaugeState;
use measured::metric::group::Encoding;
use measured::text::BufferedTextEncoder;
use std::io::ErrorKind;
use std::sync::Arc;
use tokio::net::UnixListener;
use crate::NEON_COMMUNICATOR_SOCKET_NAME;
use crate::worker_process::lfc_metrics::LfcMetricsCollector;
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
enum ControlSocketState<'a> {
Full(&'a CommunicatorWorkerProcessStruct<'a>),
Legacy(LegacyControlSocketState),
}
impl CommunicatorWorkerProcessStruct {
/// Launch the listener
pub(crate) async fn launch_control_socket_listener(
&'static self,
) -> Result<(), std::io::Error> {
use axum::routing::get;
let app = Router::new()
.route("/metrics", get(get_metrics))
.route("/autoscaling_metrics", get(get_autoscaling_metrics))
.route("/debug/panic", get(handle_debug_panic))
.with_state(self);
struct LegacyControlSocketState {
pub(crate) lfc_metrics: LfcMetricsCollector,
}
// If the server is restarted, there might be an old socket still
// lying around. Remove it first.
match std::fs::remove_file(NEON_COMMUNICATOR_SOCKET_NAME) {
Ok(()) => {
tracing::warn!("removed stale control socket");
}
Err(e) if e.kind() == ErrorKind::NotFound => {}
Err(e) => {
tracing::error!("could not remove stale control socket: {e:#}");
// Try to proceed anyway. It will likely fail below though.
}
};
// Create the unix domain socket and start listening on it
let listener = UnixListener::bind(NEON_COMMUNICATOR_SOCKET_NAME)?;
tokio::spawn(async {
tracing::info!("control socket listener spawned");
axum::serve(listener, app)
.await
.expect("axum::serve never returns")
});
impl<T> MetricGroup<T> for LegacyControlSocketState
where
T: Encoding,
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
self.lfc_metrics.collect_group_into(enc)?;
Ok(())
}
}
/// Launch the listener
pub(crate) async fn launch_listener(
worker: Option<&'static CommunicatorWorkerProcessStruct<'static>>,
) -> Result<(), std::io::Error> {
use axum::routing::get;
let state = match worker {
Some(worker) => ControlSocketState::Full(worker),
None => ControlSocketState::Legacy(LegacyControlSocketState {
lfc_metrics: LfcMetricsCollector,
}),
};
let app = Router::new()
.route("/metrics", get(get_metrics))
.route("/autoscaling_metrics", get(get_autoscaling_metrics))
.route("/debug/panic", get(handle_debug_panic))
.route("/debug/dump_cache_map", get(dump_cache_map))
.with_state(Arc::new(state));
// If the server is restarted, there might be an old socket still
// lying around. Remove it first.
match std::fs::remove_file(NEON_COMMUNICATOR_SOCKET_NAME) {
Ok(()) => {
tracing::warn!("removed stale control socket");
}
Err(e) if e.kind() == ErrorKind::NotFound => {}
Err(e) => {
tracing::error!("could not remove stale control socket: {e:#}");
// Try to proceed anyway. It will likely fail below though.
}
};
// Create the unix domain socket and start listening on it
let listener = UnixListener::bind(NEON_COMMUNICATOR_SOCKET_NAME)?;
tokio::spawn(async {
tracing::info!("control socket listener spawned");
axum::serve(listener, app)
.await
.expect("axum::serve never returns")
});
Ok(())
}
/// Expose all Prometheus metrics.
async fn get_metrics(State(state): State<Arc<ControlSocketState<'_>>>) -> Response {
match state.as_ref() {
ControlSocketState::Full(worker) => metrics_to_response(&worker).await,
ControlSocketState::Legacy(legacy) => metrics_to_response(&legacy).await,
}
async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct>) -> Response {
tracing::trace!("/metrics requested");
metrics_to_response(&state).await
}
/// Expose Prometheus metrics, for use by the autoscaling agent.
///
/// This is a subset of all the metrics.
async fn get_autoscaling_metrics(State(state): State<Arc<ControlSocketState<'_>>>) -> Response {
match state.as_ref() {
ControlSocketState::Full(worker) => metrics_to_response(&worker.lfc_metrics).await,
ControlSocketState::Legacy(legacy) => metrics_to_response(&legacy.lfc_metrics).await,
}
async fn get_autoscaling_metrics(
State(state): State<&CommunicatorWorkerProcessStruct>,
) -> Response {
tracing::trace!("/metrics requested");
metrics_to_response(&state.lfc_metrics).await
}
async fn handle_debug_panic(State(_state): State<Arc<ControlSocketState<'_>>>) -> Response {
async fn handle_debug_panic(State(_state): State<&CommunicatorWorkerProcessStruct>) -> Response {
panic!("test HTTP handler task panic");
}
@@ -134,23 +100,3 @@ async fn metrics_to_response(metrics: &(dyn MetricGroup<BufferedTextEncoder> + S
.body(Body::from(enc.finish()))
.unwrap()
}
async fn dump_cache_map(State(state): State<Arc<ControlSocketState<'_>>>) -> Response {
match state.as_ref() {
ControlSocketState::Full(worker) => {
let mut buf: Vec<u8> = Vec::new();
worker.cache.dump_map(&mut buf);
Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, "application/text")
.body(Body::from(buf))
.unwrap()
}
ControlSocketState::Legacy(_) => Response::builder()
.status(StatusCode::NOT_FOUND)
.header(CONTENT_TYPE, "application/text")
.body(Body::from(Vec::new()))
.unwrap(),
}
}

View File

@@ -1,95 +0,0 @@
//! Lock table to ensure that only one IO request is in flight for a given
//! block (or relation or database metadata) at a time
use std::cmp::Eq;
use std::hash::Hash;
use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};
use clashmap::ClashMap;
use clashmap::Entry;
use pageserver_page_api::RelTag;
#[derive(Clone, Eq, Hash, PartialEq)]
pub enum RequestInProgressKey {
Db(u32),
Rel(RelTag),
Block(RelTag, u32),
}
type RequestId = u64;
pub type RequestInProgressTable = MutexHashMap<RequestInProgressKey, RequestId>;
// more primitive locking thingie:
pub struct MutexHashMap<K, V>
where
K: Clone + Eq + Hash,
{
lock_table: ClashMap<K, (V, Arc<Mutex<()>>)>,
}
pub struct MutexHashMapGuard<'a, K, V>
where
K: Clone + Eq + Hash,
{
pub key: K,
map: &'a MutexHashMap<K, V>,
mutex: Arc<Mutex<()>>,
_guard: OwnedMutexGuard<()>,
}
impl<'a, K, V> Drop for MutexHashMapGuard<'a, K, V>
where
K: Clone + Eq + Hash,
{
fn drop(&mut self) {
let (_old_key, old_val) = self.map.lock_table.remove(&self.key).unwrap();
assert!(Arc::ptr_eq(&old_val.1, &self.mutex));
// the guard will be dropped as we return
}
}
impl<K, V> MutexHashMap<K, V>
where
K: Clone + Eq + Hash,
V: std::fmt::Display + Copy,
{
pub fn new() -> MutexHashMap<K, V> {
MutexHashMap {
lock_table: ClashMap::new(),
}
}
pub async fn lock<'a>(&'a self, key: K, val: V) -> MutexHashMapGuard<'a, K, V> {
let my_mutex = Arc::new(Mutex::new(()));
let my_guard = Arc::clone(&my_mutex).lock_owned().await;
loop {
let (request_id, lock) = match self.lock_table.entry(key.clone()) {
Entry::Occupied(e) => {
let e = e.get();
(e.0, Arc::clone(&e.1))
}
Entry::Vacant(e) => {
e.insert((val, Arc::clone(&my_mutex)));
break;
}
};
tracing::info!("waiting for conflicting IO {request_id} to complete");
let _ = lock.lock().await;
tracing::info!("conflicting IO {request_id} completed");
}
MutexHashMapGuard {
key,
map: self,
mutex: my_mutex,
_guard: my_guard,
}
}
}

View File

@@ -1,126 +1,34 @@
use std::collections::HashMap;
use std::os::fd::AsRawFd;
use std::os::fd::OwnedFd;
use std::path::PathBuf;
use std::str::FromStr as _;
use crate::backend_comms::NeonIORequestSlot;
use crate::file_cache::FileCache;
use crate::global_allocator::MyAllocatorCollector;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{CacheResult, IntegratedCacheWriteAccess};
use crate::neon_request::{CGetPageVRequest, CGetPageVUncachedRequest, CPrefetchVRequest};
use crate::neon_request::{INVALID_BLOCK_NUMBER, NeonIORequest, NeonIOResult};
use crate::worker_process::control_socket;
use crate::worker_process::in_progress_ios::{RequestInProgressKey, RequestInProgressTable};
use crate::worker_process::lfc_metrics::LfcMetricsCollector;
use pageserver_client_grpc::{PageserverClient, ShardSpec, ShardStripeSize};
use pageserver_page_api as page_api;
use tokio::io::AsyncReadExt;
use tokio_pipe::PipeRead;
use uring_common::buf::IoBuf;
use measured::MetricGroup;
use measured::metric::MetricEncoding;
use measured::metric::counter::CounterState;
use measured::metric::gauge::GaugeState;
use measured::metric::group::Encoding;
use measured::{Gauge, GaugeVec};
use utils::id::{TenantId, TimelineId};
use super::callbacks::{get_request_lsn, notify_proc};
use tracing::{error, info, info_span, trace};
use utils::lsn::Lsn;
pub struct CommunicatorWorkerProcessStruct<'a> {
/// Tokio runtime that the main loop and any other related tasks runs in.
pub(crate) runtime: tokio::runtime::Runtime,
/// Client to communicate with the pageserver
client: PageserverClient,
/// Request slots that backends use to send IO requests to the communicator.
neon_request_slots: &'a [NeonIORequestSlot],
/// Notification pipe. Backends use this to notify the communicator that a request is waiting to
/// be processed in one of the request slots.
submission_pipe_read_fd: OwnedFd,
/// Locking table for all in-progress IO requests.
in_progress_table: RequestInProgressTable,
/// Local File Cache, relation size tracking, last-written LSN tracking
pub(crate) cache: IntegratedCacheWriteAccess<'a>,
pub struct CommunicatorWorkerProcessStruct {
runtime: tokio::runtime::Runtime,
/*** Metrics ***/
pub(crate) lfc_metrics: LfcMetricsCollector,
request_counters: GaugeVec<RequestTypeLabelGroupSet>,
getpage_cache_misses_counter: Gauge,
getpage_cache_hits_counter: Gauge,
// For the requests that affect multiple blocks, have separate counters for the # of blocks affected
request_nblocks_counters: GaugeVec<RequestTypeLabelGroupSet>,
allocator_metrics: MyAllocatorCollector,
}
// Define a label group, consisting of 1 or more label values
#[derive(measured::LabelGroup)]
#[label(set = RequestTypeLabelGroupSet)]
struct RequestTypeLabelGroup {
request_type: crate::neon_request::NeonIORequestDiscriminants,
}
impl RequestTypeLabelGroup {
fn from_req(req: &NeonIORequest) -> Self {
RequestTypeLabelGroup {
request_type: req.into(),
}
}
}
/// Launch the communicator process's Rust subsystems
#[allow(clippy::too_many_arguments)]
pub(super) fn init_legacy() -> Result<(), String> {
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.thread_name("communicator thread")
.build()
.unwrap();
// Start the listener on the control socket
runtime
.block_on(control_socket::launch_listener(None))
.map_err(|e| e.to_string())?;
Box::leak(Box::new(runtime));
Ok(())
}
/// Launch the communicator process's Rust subsystems
#[allow(clippy::too_many_arguments)]
pub(super) fn init(
cis: CommunicatorInitStruct,
tenant_id: &str,
timeline_id: &str,
auth_token: Option<&str>,
shard_map: HashMap<utils::shard::ShardIndex, String>,
stripe_size: Option<ShardStripeSize>,
initial_file_cache_size: u64,
file_cache_path: Option<PathBuf>,
) -> Result<&'static CommunicatorWorkerProcessStruct<'static>, String> {
tenant_id: Option<&str>,
timeline_id: Option<&str>,
) -> Result<&'static CommunicatorWorkerProcessStruct, String> {
// The caller validated these already
let tenant_id = TenantId::from_str(tenant_id).map_err(|e| format!("invalid tenant ID: {e}"))?;
let timeline_id =
TimelineId::from_str(timeline_id).map_err(|e| format!("invalid timeline ID: {e}"))?;
let shard_spec =
ShardSpec::new(shard_map, stripe_size).map_err(|e| format!("invalid shard spec: {e}:"))?;
let _tenant_id = tenant_id
.map(TenantId::from_str)
.transpose()
.map_err(|e| format!("invalid tenant ID: {e}"))?;
let _timeline_id = timeline_id
.map(TimelineId::from_str)
.transpose()
.map_err(|e| format!("invalid timeline ID: {e}"))?;
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
@@ -128,716 +36,31 @@ pub(super) fn init(
.build()
.unwrap();
let last_lsn = get_request_lsn();
let file_cache = if let Some(path) = file_cache_path {
Some(FileCache::new(&path, initial_file_cache_size).expect("could not create cache file"))
} else {
// FIXME: temporarily for testing, use LFC even if disabled
Some(
FileCache::new(&PathBuf::from("new_filecache"), 1000)
.expect("could not create cache file"),
)
};
// Initialize subsystems
let cache = cis
.integrated_cache_init_struct
.worker_process_init(last_lsn, file_cache);
let client = {
let _guard = runtime.enter();
PageserverClient::new(
tenant_id,
timeline_id,
shard_spec,
auth_token.map(|s| s.to_string()),
None,
)
.expect("could not create client")
};
let worker_struct = CommunicatorWorkerProcessStruct {
// Note: it's important to not drop the runtime, or all the tasks are dropped
// too. Including it in the returned struct is one way to keep it around.
runtime,
neon_request_slots: cis.neon_request_slots,
client,
cache,
submission_pipe_read_fd: cis.submission_pipe_read_fd,
in_progress_table: RequestInProgressTable::new(),
// metrics
lfc_metrics: LfcMetricsCollector,
request_counters: GaugeVec::new(),
getpage_cache_misses_counter: Gauge::new(),
getpage_cache_hits_counter: Gauge::new(),
request_nblocks_counters: GaugeVec::new(),
allocator_metrics: MyAllocatorCollector::new(),
};
let worker_struct = Box::leak(Box::new(worker_struct));
let main_loop_handle = worker_struct.runtime.spawn(worker_struct.run());
worker_struct.runtime.spawn(async {
let err = main_loop_handle.await.unwrap_err();
error!("error: {err:?}");
});
// Start the listener on the control socket
worker_struct
.runtime
.block_on(control_socket::launch_listener(Some(worker_struct)))
.block_on(worker_struct.launch_control_socket_listener())
.map_err(|e| e.to_string())?;
Ok(worker_struct)
}
impl<'t> CommunicatorWorkerProcessStruct<'t> {
/// Update the configuration
pub(super) fn update_shard_map(
&self,
new_shard_map: HashMap<utils::shard::ShardIndex, String>,
stripe_size: Option<ShardStripeSize>,
) {
let shard_spec = ShardSpec::new(new_shard_map, stripe_size).expect("invalid shard spec");
{
let _in_runtime = self.runtime.enter();
if let Err(err) = self.client.update_shards(shard_spec) {
tracing::error!("could not update shard map: {err:?}");
}
}
}
/// Main loop of the worker process. Receive requests from the backends and process them.
pub(super) async fn run(&'static self) {
let mut idxbuf: [u8; 4] = [0; 4];
let mut submission_pipe_read =
PipeRead::try_from(self.submission_pipe_read_fd.as_raw_fd()).expect("invalid pipe fd");
loop {
// Wait for a backend to ring the doorbell
match submission_pipe_read.read(&mut idxbuf).await {
Ok(4) => {}
Ok(nbytes) => panic!("short read ({nbytes} bytes) on communicator pipe"),
Err(e) => panic!("error reading from communicator pipe: {e}"),
}
let slot_idx = u32::from_ne_bytes(idxbuf) as usize;
// Read the IO request from the slot indicated in the wakeup
let Some(slot) = self.neon_request_slots[slot_idx].start_processing_request() else {
// This currently should not happen. But if we had multiple threads picking up
// requests, and without waiting for the notifications, it could.
panic!("no request in slot");
};
// Ok, we have ownership of this request now. We must process it now, there's no going
// back.
//
// Spawn a separate task for every request. That's a little excessive for requests that
// can be quickly satisfied from the cache, but we expect that to be rare, because the
// requesting backend would have already checked the cache.
tokio::spawn(async move {
use tracing::Instrument;
let request_id = slot.get_request().request_id();
let owner_procno = slot.get_owner_procno();
let span = info_span!(
"processing",
request_id = request_id,
slot_idx = slot_idx,
procno = owner_procno,
);
async {
// FIXME: as a temporary hack, abort the request if we don't get a response
// promptly.
//
// Lots of regression tests are getting stuck and failing at the moment,
// this makes them fail a little faster, which it faster to iterate.
// This needs to be removed once more regression tests are passing.
// See also similar hack in the backend code, in wait_request_completion()
let result = tokio::time::timeout(
tokio::time::Duration::from_secs(60),
self.handle_request(slot.get_request()),
)
.await
.unwrap_or_else(|_elapsed| {
info!("request {request_id} timed out");
NeonIOResult::Error(libc::ETIMEDOUT)
});
trace!("request {request_id} at slot {slot_idx} completed");
// Ok, we have completed the IO. Mark the request as completed. After that,
// we no longer have ownership of the slot, and must not modify it.
slot.completed(result);
// Notify the backend about the completion. (Note that the backend might see
// the completed status even before this; this is just a wakeup)
notify_proc(owner_procno);
}
.instrument(span)
.await
});
}
}
/// Compute the 'request_lsn' to use for a pageserver request
fn request_lsns(&self, not_modified_since_lsn: Lsn) -> page_api::ReadLsn {
let mut request_lsn = get_request_lsn();
// Is it possible that the last-written LSN is ahead of last flush LSN? Generally not, we
// shouldn't evict a page from the buffer cache before all its modifications have been
// safely flushed. That's the "WAL before data" rule. However, there are a few exceptions:
//
// - when creation an index: _bt_blwritepage logs the full page without flushing WAL before
// smgrextend (files are fsynced before build ends).
//
// XXX: If we make a request LSN greater than the current WAL flush LSN, the pageserver would
// block waiting for the WAL arrive, until we flush it and it propagates through the
// safekeepers to the pageserver. If there's nothing that forces the WAL to be flushed,
// the pageserver would get stuck waiting forever. To avoid that, all the write-
// functions in communicator_new.c call XLogSetAsyncXactLSN(). That nudges the WAL writer to
// perform the flush relatively soon.
//
// It would perhaps be nicer to do the WAL flush here, but it's tricky to call back into
// Postgres code to do that from here. That's why we rely on communicator_new.c to do the
// calls "pre-emptively".
//
// FIXME: Because of the above, it can still happen that the flush LSN is ahead of
// not_modified_since, if the WAL writer hasn't done the flush yet. It would be nice to know
// if there are other cases like that that we have mised, but unfortunately we cannot turn
// this into an assertion because of that legit case.
//
// See also the old logic in neon_get_request_lsns() C function
if not_modified_since_lsn > request_lsn {
tracing::info!(
"not_modified_since_lsn {} is ahead of last flushed LSN {}",
not_modified_since_lsn,
request_lsn
);
request_lsn = not_modified_since_lsn;
}
page_api::ReadLsn {
request_lsn,
not_modified_since_lsn: Some(not_modified_since_lsn),
}
}
/// Handle one IO request
async fn handle_request(&'static self, request: &'_ NeonIORequest) -> NeonIOResult {
self.request_counters
.inc(RequestTypeLabelGroup::from_req(request));
match request {
NeonIORequest::Empty => {
error!("unexpected Empty IO request");
NeonIOResult::Error(0)
}
NeonIORequest::RelSize(req) => {
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Rel(rel), req.request_id)
.await;
// Check the cache first
let not_modified_since = match self.cache.get_rel_size(&rel) {
CacheResult::Found(nblocks) => {
tracing::trace!("found relsize for {:?} in cache: {}", rel, nblocks);
return NeonIOResult::RelSize(nblocks);
}
// XXX: we don't cache negative entries, so if there's no entry in the cache, it could mean
// that the relation doesn't exist or that we don't have it cached.
CacheResult::NotFound(lsn) => lsn,
};
let read_lsn = self.request_lsns(not_modified_since);
match self
.client
.get_rel_size(page_api::GetRelSizeRequest {
read_lsn,
rel,
allow_missing: req.allow_missing,
})
.await
{
Ok(Some(nblocks)) => {
// update the cache
tracing::trace!(
"updated relsize for {:?} in cache: {}, lsn {}",
rel,
nblocks,
read_lsn
);
self.cache
.remember_rel_size(&rel, nblocks, not_modified_since);
NeonIOResult::RelSize(nblocks)
}
Ok(None) => {
// TODO: cache negative entry?
NeonIOResult::RelSize(INVALID_BLOCK_NUMBER)
}
Err(err) => {
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
info!("tonic error: {err:?}");
NeonIOResult::Error(libc::EIO)
}
}
}
NeonIORequest::GetPageV(req) => match self.handle_get_pagev_request(req).await {
Ok(()) => NeonIOResult::GetPageV,
Err(errno) => NeonIOResult::Error(errno),
},
NeonIORequest::GetPageVUncached(req) => {
match self.handle_get_pagev_uncached_request(req).await {
Ok(()) => NeonIOResult::GetPageV,
Err(errno) => NeonIOResult::Error(errno),
}
}
NeonIORequest::ReadSlruSegment(req) => {
let lsn = Lsn(req.request_lsn);
let file_path = req.destination_file_path();
match self
.client
.get_slru_segment(page_api::GetSlruSegmentRequest {
read_lsn: self.request_lsns(lsn),
kind: req.slru_kind,
segno: req.segment_number,
})
.await
{
Ok(slru_bytes) => {
if let Err(e) = tokio::fs::write(&file_path, &slru_bytes).await {
error!("could not write slru segment to file {file_path}: {e}");
return NeonIOResult::Error(e.raw_os_error().unwrap_or(libc::EIO));
}
let blocks_count = slru_bytes.len() / crate::BLCKSZ;
NeonIOResult::ReadSlruSegment(blocks_count as _)
}
Err(err) => {
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
info!("tonic error: {err:?}");
NeonIOResult::Error(libc::EIO)
}
}
}
NeonIORequest::PrefetchV(req) => {
self.request_nblocks_counters
.inc_by(RequestTypeLabelGroup::from_req(request), req.nblocks as i64);
let req = *req;
// FIXME: handle_request() runs in a separate task already, do we really need to spawn a new one here?
tokio::spawn(async move { self.handle_prefetchv_request(&req).await });
NeonIOResult::PrefetchVLaunched
}
NeonIORequest::DbSize(req) => {
let _in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Db(req.db_oid), req.request_id)
.await;
// Check the cache first
let not_modified_since = match self.cache.get_db_size(req.db_oid) {
CacheResult::Found(db_size) => {
// get_page already copied the block content to the destination
return NeonIOResult::DbSize(db_size);
}
CacheResult::NotFound(lsn) => lsn,
};
match self
.client
.get_db_size(page_api::GetDbSizeRequest {
read_lsn: self.request_lsns(not_modified_since),
db_oid: req.db_oid,
})
.await
{
Ok(db_size) => NeonIOResult::DbSize(db_size),
Err(err) => {
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
info!("tonic error: {err:?}");
NeonIOResult::Error(libc::EIO)
}
}
}
// Write requests
NeonIORequest::WritePage(req) => {
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(
RequestInProgressKey::Block(rel, req.block_number),
req.request_id,
)
.await;
// We must at least update the last-written LSN on the page, but also store the page
// image in the LFC while we still have it
self.cache
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
.await;
NeonIOResult::WriteOK
}
NeonIORequest::RelExtend(req) => {
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(
RequestInProgressKey::Block(rel, req.block_number),
req.request_id,
)
.await;
// We must at least update the last-written LSN on the page and the relation size,
// but also store the page image in the LFC while we still have it
self.cache
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
.await;
self.cache
.remember_rel_size(&req.reltag(), req.block_number + 1, Lsn(req.lsn));
NeonIOResult::WriteOK
}
NeonIORequest::RelZeroExtend(req) => {
self.request_nblocks_counters
.inc_by(RequestTypeLabelGroup::from_req(request), req.nblocks as i64);
// TODO: need to grab an io-in-progress lock for this? I guess not
// TODO: We could put the empty pages to the cache. Maybe have
// a marker on the block entries for all-zero pages, instead of
// actually storing the empty pages.
self.cache.remember_rel_size(
&req.reltag(),
req.block_number + req.nblocks,
Lsn(req.lsn),
);
NeonIOResult::WriteOK
}
NeonIORequest::RelCreate(req) => {
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache.remember_rel_size(&req.reltag(), 0, Lsn(req.lsn));
NeonIOResult::WriteOK
}
NeonIORequest::RelTruncate(req) => {
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache
.remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
NeonIOResult::WriteOK
}
NeonIORequest::RelUnlink(req) => {
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache.forget_rel(&req.reltag(), None, Lsn(req.lsn));
NeonIOResult::WriteOK
}
NeonIORequest::UpdateCachedRelSize(req) => {
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache
.remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
NeonIOResult::WriteOK
}
}
}
/// Subroutine to handle a GetPageV request, since it's a little more complicated than
/// others.
async fn handle_get_pagev_request(&'t self, req: &CGetPageVRequest) -> Result<(), i32> {
let rel = req.reltag();
// Check the cache first
//
// Note: Because the backends perform a direct lookup in the cache before sending
// the request to the communicator process, we expect the pages to almost never
// be already in cache. It could happen if:
// 1. two backends try to read the same page at the same time, but that should never
// happen because there's higher level locking in the Postgres buffer manager, or
// 2. a prefetch request finished at the same time as a backend requested the
// page. That's much more likely.
let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
for i in 0..req.nblocks {
let blkno = req.block_number + i as u32;
// note: this is deadlock-safe even though we hold multiple locks at the same time,
// because they're always acquired in the same order.
let in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Block(rel, blkno), req.request_id)
.await;
let dest = req.dest[i as usize];
let not_modified_since = match self.cache.get_page(&rel, blkno, dest).await {
Ok(CacheResult::Found(_)) => {
// get_page already copied the block content to the destination
trace!("found blk {} in rel {:?} in LFC", blkno, rel);
continue;
}
Ok(CacheResult::NotFound(lsn)) => lsn,
Err(_io_error) => return Err(libc::EIO), // FIXME print the error?
};
cache_misses.push((blkno, not_modified_since, dest, in_progress_guard));
}
self.getpage_cache_misses_counter
.inc_by(cache_misses.len() as i64);
self.getpage_cache_hits_counter
.inc_by(req.nblocks as i64 - cache_misses.len() as i64);
if cache_misses.is_empty() {
return Ok(());
}
let not_modified_since = cache_misses
.iter()
.map(|(_blkno, lsn, _dest, _guard)| *lsn)
.max()
.unwrap();
// Construct a pageserver request for the cache misses
let block_numbers: Vec<u32> = cache_misses
.iter()
.map(|(blkno, _lsn, _dest, _guard)| *blkno)
.collect();
let read_lsn = self.request_lsns(not_modified_since);
trace!(
"sending getpage request for blocks {:?} in rel {:?} lsns {}",
block_numbers, rel, read_lsn
);
match self
.client
.get_page(page_api::GetPageRequest {
request_id: req.request_id.into(),
request_class: page_api::GetPageClass::Normal,
read_lsn,
rel,
block_numbers: block_numbers.clone(),
})
.await
{
Ok(resp) => {
// Write the received page images directly to the shared memory location
// that the backend requested.
if resp.pages.len() != block_numbers.len() {
error!(
"received unexpected response with {} page images from pageserver for a request for {} pages",
resp.pages.len(),
block_numbers.len(),
);
return Err(libc::EIO);
}
trace!(
"received getpage response for blocks {:?} in rel {:?} lsns {}",
block_numbers, rel, read_lsn
);
for (page, (blkno, _lsn, dest, _guard)) in resp.pages.into_iter().zip(cache_misses)
{
let src: &[u8] = page.image.as_ref();
let len = std::cmp::min(src.len(), dest.bytes_total());
unsafe {
std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
};
// Also store it in the LFC while we have it
self.cache
.remember_page(
&rel,
blkno,
page.image,
read_lsn.not_modified_since_lsn.unwrap(),
false,
)
.await;
}
}
Err(err) => {
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
info!("tonic error: {err:?}");
return Err(libc::EIO);
}
}
Ok(())
}
/// Subroutine to handle an GetPageVUncached request.
///
/// Note: this bypasses the cache, in-progress IO locking, and all other side-effects.
/// This request type is only used in tests.
async fn handle_get_pagev_uncached_request(
&'t self,
req: &CGetPageVUncachedRequest,
) -> Result<(), i32> {
let rel = req.reltag();
// Construct a pageserver request
let block_numbers: Vec<u32> =
(req.block_number..(req.block_number + (req.nblocks as u32))).collect();
let read_lsn = page_api::ReadLsn {
request_lsn: Lsn(req.request_lsn),
not_modified_since_lsn: Some(Lsn(req.not_modified_since)),
};
trace!(
"sending (uncached) getpage request for blocks {:?} in rel {:?} lsns {}",
block_numbers, rel, read_lsn
);
match self
.client
.get_page(page_api::GetPageRequest {
request_id: req.request_id.into(),
request_class: page_api::GetPageClass::Normal,
read_lsn,
rel,
block_numbers: block_numbers.clone(),
})
.await
{
Ok(resp) => {
// Write the received page images directly to the shared memory location
// that the backend requested.
if resp.pages.len() != block_numbers.len() {
error!(
"received unexpected response with {} page images from pageserver for a request for {} pages",
resp.pages.len(),
block_numbers.len(),
);
return Err(libc::EIO);
}
trace!(
"received getpage response for blocks {:?} in rel {:?} lsns {}",
block_numbers, rel, read_lsn
);
for (page, dest) in resp.pages.into_iter().zip(req.dest) {
let src: &[u8] = page.image.as_ref();
let len = std::cmp::min(src.len(), dest.bytes_total());
unsafe {
std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
};
}
}
Err(err) => {
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
info!("tonic error: {err:?}");
return Err(libc::EIO);
}
}
Ok(())
}
/// Subroutine to handle a PrefetchV request, since it's a little more complicated than
/// others.
///
/// This is very similar to a GetPageV request, but the results are only stored in the cache.
async fn handle_prefetchv_request(&'static self, req: &CPrefetchVRequest) -> Result<(), i32> {
let rel = req.reltag();
// Check the cache first
let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
for i in 0..req.nblocks {
let blkno = req.block_number + i as u32;
// note: this is deadlock-safe even though we hold multiple locks at the same time,
// because they're always acquired in the same order.
let in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Block(rel, blkno), req.request_id)
.await;
let not_modified_since = match self.cache.page_is_cached(&rel, blkno).await {
Ok(CacheResult::Found(_)) => {
trace!("found blk {} in rel {:?} in LFC", blkno, rel);
continue;
}
Ok(CacheResult::NotFound(lsn)) => lsn,
Err(_io_error) => return Err(libc::EIO), // FIXME print the error?
};
cache_misses.push((blkno, not_modified_since, in_progress_guard));
}
if cache_misses.is_empty() {
return Ok(());
}
let not_modified_since = cache_misses
.iter()
.map(|(_blkno, lsn, _guard)| *lsn)
.max()
.unwrap();
let block_numbers: Vec<u32> = cache_misses
.iter()
.map(|(blkno, _lsn, _guard)| *blkno)
.collect();
// TODO: spawn separate tasks for these. Use the integrated cache to keep track of the
// in-flight requests
match self
.client
.get_page(page_api::GetPageRequest {
request_id: req.request_id.into(),
request_class: page_api::GetPageClass::Prefetch,
read_lsn: self.request_lsns(not_modified_since),
rel,
block_numbers: block_numbers.clone(),
})
.await
{
Ok(resp) => {
trace!(
"prefetch completed, remembering blocks {:?} in rel {:?} in LFC",
block_numbers, rel
);
if resp.pages.len() != block_numbers.len() {
error!(
"received unexpected response with {} page images from pageserver for a request for {} pages",
resp.pages.len(),
block_numbers.len(),
);
return Err(libc::EIO);
}
for (page, (blkno, _lsn, _guard)) in resp.pages.into_iter().zip(cache_misses) {
self.cache
.remember_page(&rel, blkno, page.image, not_modified_since, false)
.await;
}
}
Err(err) => {
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
info!("tonic error: {err:?}");
return Err(libc::EIO);
}
}
Ok(())
}
}
impl<T> MetricGroup<T> for CommunicatorWorkerProcessStruct<'_>
impl<T> MetricGroup<T> for CommunicatorWorkerProcessStruct
where
T: Encoding,
CounterState: MetricEncoding<T>,
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
use measured::metric::MetricFamilyEncoding;
use measured::metric::name::MetricName;
self.lfc_metrics.collect_group_into(enc)?;
self.cache.collect_group_into(enc)?;
self.request_counters
.collect_family_into(MetricName::from_str("request_counters"), enc)?;
self.request_nblocks_counters
.collect_family_into(MetricName::from_str("request_nblocks_counters"), enc)?;
self.allocator_metrics.collect_group_into(enc)?;
Ok(())
self.lfc_metrics.collect_group_into(enc)
}
}

View File

@@ -4,9 +4,9 @@
//! - launch the main loop,
//! - receive IO requests from backends and process them,
//! - write results back to backends.
mod callbacks;
mod control_socket;
mod in_progress_ios;
mod lfc_metrics;
mod logging;
mod main_loop;

View File

@@ -1,21 +1,14 @@
//! Functions called from the C code in the worker process
use std::collections::HashMap;
use std::ffi::{CStr, CString, c_char};
use std::path::PathBuf;
use crate::init::CommunicatorInitStruct;
use crate::worker_process::main_loop;
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
use pageserver_client_grpc::ShardStripeSize;
/// Launch the communicator's tokio tasks, which do most of the work.
///
/// The caller has initialized the process as a regular PostgreSQL background worker
/// process. The shared memory segment used to communicate with the backends has been
/// allocated and initialized earlier, at postmaster startup, in
/// rcommunicator_shmem_init().
/// process.
///
/// Inputs:
/// `tenant_id` and `timeline_id` can be NULL, if we're been launched in "non-Neon" mode,
@@ -30,63 +23,27 @@ use pageserver_client_grpc::ShardStripeSize;
/// This is called only once in the process, so the returned struct, and error message in
/// case of failure, are simply leaked.
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_process_launch(
cis: Box<CommunicatorInitStruct>,
pub extern "C" fn communicator_worker_launch(
tenant_id: *const c_char,
timeline_id: *const c_char,
auth_token: *const c_char,
shard_map: *mut *mut c_char,
nshards: u32,
stripe_size: u32,
file_cache_path: *const c_char,
initial_file_cache_size: u64,
error_p: *mut *const c_char,
) -> Option<&'static CommunicatorWorkerProcessStruct<'static>> {
tracing::warn!("starting threads in rust code");
) -> Option<&'static CommunicatorWorkerProcessStruct> {
// Convert the arguments into more convenient Rust types
let tenant_id = {
let cstr = unsafe { CStr::from_ptr(tenant_id) };
cstr.to_str().expect("assume UTF-8")
};
let timeline_id = {
let cstr = unsafe { CStr::from_ptr(timeline_id) };
cstr.to_str().expect("assume UTF-8")
};
let auth_token = if auth_token.is_null() {
let tenant_id = if tenant_id.is_null() {
None
} else {
let cstr = unsafe { CStr::from_ptr(auth_token) };
let cstr = unsafe { CStr::from_ptr(tenant_id) };
Some(cstr.to_str().expect("assume UTF-8"))
};
let file_cache_path = {
if file_cache_path.is_null() {
None
} else {
let c_str = unsafe { CStr::from_ptr(file_cache_path) };
Some(PathBuf::from(c_str.to_str().unwrap()))
}
};
let shard_map = shard_map_to_hash(nshards, shard_map);
// FIXME: distinguish between unsharded, and sharded with 1 shard
// Also, we might go from unsharded to sharded while the system
// is running.
let stripe_size = if stripe_size > 0 && nshards > 1 {
Some(ShardStripeSize(stripe_size))
} else {
let timeline_id = if timeline_id.is_null() {
None
} else {
let cstr = unsafe { CStr::from_ptr(timeline_id) };
Some(cstr.to_str().expect("assume UTF-8"))
};
// The `init` function does all the work.
let result = main_loop::init(
*cis,
tenant_id,
timeline_id,
auth_token,
shard_map,
stripe_size,
initial_file_cache_size,
file_cache_path,
);
let result = main_loop::init(tenant_id, timeline_id);
// On failure, return the error message to the C caller in *error_p.
match result {
@@ -101,68 +58,3 @@ pub extern "C" fn communicator_worker_process_launch(
}
}
}
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_process_launch_legacy(error_p: *mut *const c_char) -> bool {
// The `init` function does all the work.
let result = main_loop::init_legacy();
// On failure, return the error message to the C caller in *error_p.
match result {
Ok(()) => true,
Err(errmsg) => {
let errmsg = CString::new(errmsg).expect("no nuls within error message");
let errmsg = Box::leak(errmsg.into_boxed_c_str());
let p: *const c_char = errmsg.as_ptr();
unsafe { *error_p = p };
false
}
}
}
/// Convert the "shard map" from an array of C strings, indexed by shard no to a rust HashMap
fn shard_map_to_hash(
nshards: u32,
shard_map: *mut *mut c_char,
) -> HashMap<utils::shard::ShardIndex, String> {
use utils::shard::*;
assert!(nshards <= u8::MAX as u32);
let mut result: HashMap<ShardIndex, String> = HashMap::new();
let mut p = shard_map;
for i in 0..nshards {
let c_str = unsafe { CStr::from_ptr(*p) };
p = unsafe { p.add(1) };
let s = c_str.to_str().unwrap();
let k = if nshards > 1 {
ShardIndex::new(ShardNumber(i as u8), ShardCount(nshards as u8))
} else {
ShardIndex::unsharded()
};
result.insert(k, s.into());
}
result
}
/// Inform the rust code about a configuration change
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_config_reload(
proc_handle: &'static CommunicatorWorkerProcessStruct<'static>,
file_cache_size: u64,
shard_map: *mut *mut c_char,
nshards: u32,
stripe_size: u32,
) {
proc_handle.runtime.spawn_blocking(move || {
proc_handle.cache.resize_file_cache(file_cache_size as u32);
});
let shard_map = shard_map_to_hash(nshards, shard_map);
let stripe_size = (nshards > 1).then_some(ShardStripeSize(stripe_size));
proc_handle.update_shard_map(shard_map, stripe_size);
}

Some files were not shown because too many files have changed in this diff Show More