Compare commits

..

12 Commits

Author SHA1 Message Date
Conrad Ludgate
cc66f78d01 update readme 2025-07-31 11:51:44 +01:00
Conrad Ludgate
f9e6802974 s/ssl/tls 2025-07-30 14:03:22 +01:00
Conrad Ludgate
74afc9d96f refactor pgbouncer tuning 2025-07-30 12:34:36 +01:00
Conrad Ludgate
86fe3150f0 add basic tls test 2025-07-30 12:34:31 +01:00
Conrad Ludgate
52be0146d3 fix runtime 2025-07-30 12:32:23 +01:00
Conrad Ludgate
a3f2a2cae5 add fast path for TLS renewal configuration 2025-07-30 12:29:41 +01:00
Conrad Ludgate
a24a0032ad update certificate files in the watch task 2025-07-30 11:47:34 +01:00
Conrad Ludgate
70cb02742a pass in the tls_config as a param to watch_certs_for_changes, also wait for it to complete before configuring pgbouncer/local_proxy 2025-07-30 11:47:07 +01:00
Conrad Ludgate
a845295cb3 refactor TLS processing. Only use blocking-IO, split out the loading of certificates from the updating of certificates 2025-07-30 10:29:03 +01:00
Conrad Ludgate
e288cd2198 fix concurrent reconfigure while TLS configuration is taking place 2025-07-30 10:14:20 +01:00
Conrad Ludgate
ffa9e595b8 introduce separate reload commands 2025-07-30 10:14:17 +01:00
Conrad Ludgate
e7b1f63f68 add logs for TLS 2025-07-30 10:08:04 +01:00
138 changed files with 1372 additions and 8058 deletions

1
.gitignore vendored
View File

@@ -15,7 +15,6 @@ neon.iml
/.neon
/integration_tests/.neon
compaction-suite-results.*
pgxn/neon/communicator/communicator_bindings.h
docker-compose/docker-compose-parallel.yml
# Coverage

108
Cargo.lock generated
View File

@@ -259,17 +259,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
[[package]]
name = "atomic_enum"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -1307,30 +1296,13 @@ dependencies = [
name = "communicator"
version = "0.1.0"
dependencies = [
"atomic_enum",
"axum",
"bytes",
"cbindgen",
"clashmap",
"http 1.3.1",
"libc",
"measured",
"metrics",
"neon-shmem",
"nix 0.30.1",
"pageserver_api",
"pageserver_client_grpc",
"pageserver_page_api",
"prometheus",
"prost 0.13.5",
"strum_macros",
"thiserror 1.0.69",
"tokio",
"tokio-pipe",
"tonic",
"tracing",
"tracing-subscriber",
"uring-common",
"utils",
"workspace_hack",
]
@@ -1671,9 +1643,9 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
[[package]]
name = "crossterm"
@@ -2389,12 +2361,6 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "foldhash"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "form_urlencoded"
version = "1.2.1"
@@ -2776,16 +2742,6 @@ version = "0.15.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
[[package]]
name = "hashbrown"
version = "0.15.4"
source = "git+https://github.com/quantumish/hashbrown.git?rev=6610e6d#6610e6d2b1f288ef7b0709a3efefbc846395dc5e"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash",
]
[[package]]
name = "hashlink"
version = "0.9.1"
@@ -3866,7 +3822,7 @@ dependencies = [
"prometheus",
"rand 0.9.1",
"rand_distr",
"twox-hash 1.6.3",
"twox-hash",
]
[[package]]
@@ -3972,21 +3928,15 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
name = "neon-shmem"
version = "0.1.0"
dependencies = [
"ahash",
"criterion",
"hashbrown 0.15.4",
"libc",
"lock_api",
"nix 0.30.1",
"rand 0.9.1",
"rand_distr",
"rustc-hash 2.1.1",
"seahash",
"tempfile",
"thiserror 1.0.69",
"twox-hash 2.1.1",
"workspace_hack",
"xxhash-rust",
]
[[package]]
@@ -4441,16 +4391,13 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"axum",
"bytes",
"camino",
"clap",
"futures",
"hdrhistogram",
"http 1.3.1",
"humantime",
"humantime-serde",
"metrics",
"pageserver_api",
"pageserver_client",
"pageserver_client_grpc",
@@ -4540,7 +4487,6 @@ dependencies = [
"pageserver_client",
"pageserver_compaction",
"pageserver_page_api",
"peekable",
"pem",
"pin-project-lite",
"postgres-protocol",
@@ -4554,7 +4500,6 @@ dependencies = [
"pprof",
"pq_proto",
"procfs",
"prost 0.13.5",
"rand 0.9.1",
"range-set-blaze",
"regex",
@@ -4591,7 +4536,7 @@ dependencies = [
"tower 0.5.2",
"tracing",
"tracing-utils",
"twox-hash 1.6.3",
"twox-hash",
"url",
"utils",
"uuid",
@@ -4803,7 +4748,7 @@ dependencies = [
"paste",
"seq-macro",
"thrift",
"twox-hash 1.6.3",
"twox-hash",
"zstd",
"zstd-sys",
]
@@ -4849,15 +4794,6 @@ dependencies = [
"sha2",
]
[[package]]
name = "peekable"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
dependencies = [
"smallvec",
]
[[package]]
name = "pem"
version = "3.0.3"
@@ -6557,12 +6493,6 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "621e3680f3e07db4c9c2c3fb07c6223ab2fab2e54bd3c04c3ae037990f428c32"
[[package]]
name = "seahash"
version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
[[package]]
name = "sec1"
version = "0.3.0"
@@ -7716,16 +7646,6 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "tokio-pipe"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
dependencies = [
"libc",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.10"
@@ -8263,15 +8183,6 @@ dependencies = [
"static_assertions",
]
[[package]]
name = "twox-hash"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56"
dependencies = [
"rand 0.9.1",
]
[[package]]
name = "typed-json"
version = "0.1.1"
@@ -9102,8 +9013,8 @@ dependencies = [
"clap",
"clap_builder",
"const-oid",
"criterion",
"crossbeam-epoch",
"crossbeam-utils",
"crypto-bigint 0.5.5",
"der 0.7.8",
"deranged",
@@ -9146,6 +9057,7 @@ dependencies = [
"num-iter",
"num-rational",
"num-traits",
"once_cell",
"p256 0.13.2",
"parquet",
"portable-atomic",
@@ -9254,12 +9166,6 @@ version = "0.13.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd"
[[package]]
name = "xxhash-rust"
version = "0.8.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
[[package]]
name = "yansi"
version = "1.0.1"

View File

@@ -93,7 +93,6 @@ clap = { version = "4.0", features = ["derive", "env"] }
clashmap = { version = "1.0", features = ["raw-api"] }
comfy-table = "7.1"
const_format = "0.2"
crossbeam-utils = "0.8.21"
crc32c = "0.6"
diatomic-waker = { version = "0.2.3" }
either = "1.8"
@@ -153,7 +152,6 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pem = "3.0.3"
peekable = "0.3.0"
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
procfs = "0.16"
@@ -192,7 +190,6 @@ smallvec = "1.11"
smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
spki = "0.7.3"
spin = "0.9.8"
strum = "0.26"
strum_macros = "0.26"
"subtle" = "2.5.0"
@@ -204,6 +201,7 @@ thiserror = "1.0"
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
tokio = { version = "1.43.1", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
@@ -244,9 +242,6 @@ zeroize = "1.8"
env_logger = "0.11"
log = "0.4"
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }

View File

@@ -1 +1 @@
SELECT num_requested AS checkpoints_req FROM pg_catalog.pg_stat_checkpointer;
SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer;

View File

@@ -1 +1 @@
SELECT checkpoints_req FROM pg_catalog.pg_stat_bgwriter;
SELECT checkpoints_req FROM pg_stat_bgwriter;

View File

@@ -1 +1 @@
SELECT checkpoints_timed FROM pg_catalog.pg_stat_bgwriter;
SELECT checkpoints_timed FROM pg_stat_bgwriter;

View File

@@ -1 +1 @@
SELECT (neon.backpressure_throttling_time()::pg_catalog.float8 / 1000000) AS throttled;
SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled;

View File

@@ -1,4 +1,4 @@
SELECT CASE
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_replay_lsn() - '0/0')::pg_catalog.FLOAT8
ELSE (pg_catalog.pg_current_wal_lsn() - '0/0')::pg_catalog.FLOAT8
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8
ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8
END AS lsn;

View File

@@ -1,7 +1,7 @@
SELECT
(SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COUNT(*) FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;
(SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files;

View File

@@ -1,7 +1,7 @@
SELECT
(SELECT pg_catalog.current_setting('neon.timeline_id')) AS timeline_id,
(SELECT current_setting('neon.timeline_id')) AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(pg_catalog.sum(size), 0) FROM pg_catalog.pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;
(SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes;

View File

@@ -1,9 +1,9 @@
SELECT
(SELECT setting FROM pg_catalog.pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
(SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id,
-- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp.
-- These temporary snapshot files are renamed to the actual snapshot files
-- after they are completely built. We only WAL-log the completely built
-- snapshot files
(SELECT COALESCE(pg_catalog.sum((pg_catalog.pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
FROM (SELECT * FROM pg_catalog.pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
(SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0)
FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name
) AS logical_snapshots_bytes;

View File

@@ -1 +1 @@
SELECT pg_catalog.current_setting('max_connections') AS max_connections;
SELECT current_setting('max_connections') as max_connections;

View File

@@ -1,4 +1,4 @@
SELECT datname database_name,
pg_catalog.age(datfrozenxid) frozen_xid_age
FROM pg_catalog.pg_database
age(datfrozenxid) frozen_xid_age
FROM pg_database
ORDER BY frozen_xid_age DESC LIMIT 10;

View File

@@ -1,4 +1,4 @@
SELECT datname database_name,
pg_catalog.mxid_age(datminmxid) min_mxid_age
FROM pg_catalog.pg_database
mxid_age(datminmxid) min_mxid_age
FROM pg_database
ORDER BY min_mxid_age DESC LIMIT 10;

View File

@@ -1,4 +1,4 @@
SELECT CASE
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_catalog.pg_last_wal_receive_lsn() - '0/0')::pg_catalog.FLOAT8
WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8
ELSE 0
END AS lsn;

View File

@@ -1 +1 @@
SELECT subenabled::pg_catalog.text AS enabled, pg_catalog.count(*) AS subscriptions_count FROM pg_catalog.pg_subscription GROUP BY subenabled;
SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled;

View File

@@ -1 +1 @@
SELECT datname, state, pg_catalog.count(*) AS count FROM pg_catalog.pg_stat_activity WHERE state <> '' GROUP BY datname, state;
SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state;

View File

@@ -1,5 +1,5 @@
SELECT pg_catalog.sum(pg_catalog.pg_database_size(datname)) AS total
FROM pg_catalog.pg_database
SELECT sum(pg_database_size(datname)) AS total
FROM pg_database
-- Ignore invalid databases, as we will likely have problems with
-- getting their size from the Pageserver.
WHERE datconnlimit != -2;

View File

@@ -3,6 +3,6 @@
-- minutes.
SELECT
x::pg_catalog.text AS duration_seconds,
x::text as duration_seconds,
neon.approximate_working_set_size_seconds(x) AS size
FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x);

View File

@@ -3,6 +3,6 @@
SELECT
x AS duration,
neon.approximate_working_set_size_seconds(extract('epoch' FROM x::pg_catalog.interval)::pg_catalog.int4) AS size FROM (
neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM (
VALUES ('5m'), ('15m'), ('1h')
) AS t (x);

View File

@@ -1 +1 @@
SELECT pg_catalog.pg_size_bytes(pg_catalog.current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;
SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit;

View File

@@ -1,3 +1,3 @@
SELECT slot_name, (restart_lsn - '0/0')::pg_catalog.FLOAT8 AS restart_lsn
FROM pg_catalog.pg_replication_slots
SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn
FROM pg_replication_slots
WHERE slot_type = 'logical';

View File

@@ -1 +1 @@
SELECT setting::pg_catalog.int4 AS max_cluster_size FROM pg_catalog.pg_settings WHERE name = 'neon.max_cluster_size';
SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size';

View File

@@ -1,13 +1,13 @@
-- We export stats for 10 non-system databases. Without this limit it is too
-- easy to abuse the system by creating lots of databases.
SELECT pg_catalog.pg_database_size(datname) AS db_size,
SELECT pg_database_size(datname) AS db_size,
deadlocks,
tup_inserted AS inserted,
tup_updated AS updated,
tup_deleted AS deleted,
datname
FROM pg_catalog.pg_stat_database
FROM pg_stat_database
WHERE datname IN (
SELECT datname FROM pg_database
-- Ignore invalid databases, as we will likely have problems with

View File

@@ -3,4 +3,4 @@
-- replay LSN may have advanced past the receive LSN we are using for the
-- calculation.
SELECT GREATEST(0, pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_last_wal_receive_lsn(), pg_catalog.pg_last_wal_replay_lsn())) AS replication_delay_bytes;
SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes;

View File

@@ -1,5 +1,5 @@
SELECT
CASE
WHEN pg_catalog.pg_last_wal_receive_lsn() = pg_catalog.pg_last_wal_replay_lsn() THEN 0
ELSE GREATEST(0, EXTRACT (EPOCH FROM pg_catalog.now() - pg_catalog.pg_last_xact_replay_timestamp()))
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()))
END AS replication_delay_seconds;

View File

@@ -1,10 +1,10 @@
SELECT
slot_name,
pg_catalog.pg_wal_lsn_diff(
pg_wal_lsn_diff(
CASE
WHEN pg_catalog.pg_is_in_recovery() THEN pg_catalog.pg_last_wal_replay_lsn()
ELSE pg_catalog.pg_current_wal_lsn()
WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn()
ELSE pg_current_wal_lsn()
END,
restart_lsn)::pg_catalog.FLOAT8 AS retained_wal
FROM pg_catalog.pg_replication_slots
restart_lsn)::FLOAT8 AS retained_wal
FROM pg_replication_slots
WHERE active = false;

View File

@@ -4,4 +4,4 @@ SELECT
WHEN wal_status = 'lost' THEN 1
ELSE 0
END AS wal_is_lost
FROM pg_catalog.pg_replication_slots;
FROM pg_replication_slots;

View File

@@ -57,6 +57,9 @@ stateDiagram-v2
RefreshConfigurationPending --> RefreshConfiguration: Received compute spec and started configuration
RefreshConfiguration --> Running : Compute has been re-configured
RefreshConfiguration --> RefreshConfigurationPending : Configuration failed and to be retried
Running --> Reloading : Local changes (TLS certificate renewal) were detected and postgres is being reloaded
Reloading --> Running : Postgres was reloaded
Reloading --> Failed : Failed to reload postgres
TerminationPendingFast --> Terminated compute with 30s delay for cplane to inspect status
TerminationPendingImmediate --> Terminated : Terminated compute immediately
Failed --> RefreshConfigurationPending : Received a /refresh_configuration request

View File

@@ -279,7 +279,7 @@ fn main() -> Result<()> {
config,
)?;
let exit_code = compute_node.run().context("running compute node")?;
let exit_code = compute_node.run()?;
scenario.teardown();

View File

@@ -24,9 +24,9 @@ pub async fn check_writability(compute: &ComputeNode) -> Result<()> {
});
let query = "
INSERT INTO public.health_check VALUES (1, pg_catalog.now())
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = pg_catalog.now();";
SET updated_at = now();";
match client.simple_query(query).await {
Result::Ok(result) => {

View File

@@ -28,7 +28,7 @@ use std::path::Path;
use std::process::{Command, Stdio};
use std::str::FromStr;
use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
use std::sync::{Arc, Condvar, Mutex, RwLock};
use std::sync::{Arc, Condvar, Mutex, MutexGuard, RwLock};
use std::time::{Duration, Instant};
use std::{env, fs};
use tokio::{spawn, sync::watch, task::JoinHandle, time};
@@ -57,7 +57,6 @@ use crate::rsyslog::{
use crate::spec::*;
use crate::swap::resize_swap;
use crate::sync_sk::{check_if_synced, ping_safekeeper};
use crate::tls::watch_cert_for_changes;
use crate::{config, extension_server, local_proxy};
pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
@@ -583,7 +582,7 @@ impl ComputeNode {
// that can affect `compute_ctl` and prevent it from properly configuring the database schema.
// Unset them via connection string options before connecting to the database.
// N.B. keep it in sync with `ZENITH_OPTIONS` in `get_maintenance_client()`.
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0 -c pgaudit.log=none";
const EXTRA_OPTIONS: &str = "-c role=cloud_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0 -c pgaudit.log=none";
let options = match conn_conf.get_options() {
// Allow the control plane to override any options set by the
// compute
@@ -842,14 +841,11 @@ impl ComputeNode {
let mut pre_tasks = tokio::task::JoinSet::new();
// Make sure TLS certificates are properly loaded and in the right place.
if self.compute_ctl_config.tls.is_some() {
let tls_task = self.compute_ctl_config.tls.as_ref().map(|tls_config| {
let this = self.clone();
pre_tasks.spawn(async move {
this.watch_cert_for_changes().await;
Ok::<(), anyhow::Error>(())
});
}
let tls_config = tls_config.clone();
tokio::task::spawn_blocking(|| this.watch_cert_for_changes(tls_config))
});
let tls_config = self.tls_config(&pspec.spec);
@@ -904,6 +900,13 @@ impl ComputeNode {
});
}
// Wait for TLS certificates to be issued before updating pgbouncer and local proxy.
let rt = tokio::runtime::Handle::current();
if let Some(tls_task) = tls_task {
rt.block_on(tls_task)
.context("TLS certificate renewal task panicked")?;
}
// tune pgbouncer
if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
info!("tuning pgbouncer");
@@ -986,7 +989,6 @@ impl ComputeNode {
let _configurator_handle = launch_configurator(self);
// Wait for all the pre-tasks to finish before starting postgres
let rt = tokio::runtime::Handle::current();
while let Some(res) = rt.block_on(pre_tasks.join_next()) {
res??;
}
@@ -1884,7 +1886,7 @@ impl ComputeNode {
// It doesn't matter what were the options before, here we just want
// to connect and create a new superuser role.
const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path='' -c statement_timeout=0";
const ZENITH_OPTIONS: &str = "-c role=zenith_admin -c default_transaction_read_only=off -c search_path=public -c statement_timeout=0";
zenith_admin_conf.options(ZENITH_OPTIONS);
let mut client =
@@ -1949,10 +1951,7 @@ impl ComputeNode {
.clone(),
);
let mut tls_config = None::<TlsConfig>;
if spec.features.contains(&ComputeFeature::TlsExperimental) {
tls_config = self.compute_ctl_config.tls.clone();
}
let tls_config = self.tls_config(&spec);
self.update_installed_extensions_collection_interval(&spec);
@@ -2134,6 +2133,60 @@ impl ComputeNode {
Ok(())
}
/// Tell postgres/pgbouncer/local_proxy to reload their configurations.
#[instrument(skip_all)]
pub fn reload(&self, spec: ComputeSpec) -> Result<()> {
let rt = tokio::runtime::Handle::current();
if spec.pgbouncer_settings.is_some() {
rt.block_on(reload_pgbouncer())?;
}
if spec.local_proxy_config.is_some() {
local_proxy::reload()?;
}
self.pg_reload_conf()?;
let unknown_op = "unknown".to_string();
let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
info!("finished reload of compute node for operation {op_id}");
Ok(())
}
/// Acquire the "reloading" lock while running the supplied function.
///
/// This ensures that this thread is the only thread that
/// can issue signals to postgres.
///
/// If the supplied function errors, the compute status is marked as failed.
pub fn lock_while_reloading<T>(
&self,
mut state: MutexGuard<'_, ComputeState>,
f: impl FnOnce(ComputeSpec) -> Result<T>,
) -> Result<T> {
let old_status = state.status;
// transition to the reloading state.
state.set_status(ComputeStatus::Reloading, &self.state_changed);
let spec = state.pspec.as_ref().unwrap().spec.clone();
// unlock while reloading, so we don't block other tasks.
drop(state);
let res = f(spec);
let new_status = if res.is_ok() {
old_status
} else {
ComputeStatus::Failed
};
let mut state = self.state.lock().unwrap();
// make sure our invariants are upheld
assert_eq!(state.status, ComputeStatus::Reloading);
state.set_status(new_status, &self.state_changed);
res
}
#[instrument(skip_all)]
pub fn configure_as_primary(&self, compute_state: &ComputeState) -> Result<()> {
let pspec = compute_state.pspec.as_ref().expect("spec must be set");
@@ -2168,57 +2221,103 @@ impl ComputeNode {
Ok(())
}
pub async fn watch_cert_for_changes(self: Arc<Self>) {
// update status on cert renewal
if let Some(tls_config) = &self.compute_ctl_config.tls {
let tls_config = tls_config.clone();
pub fn watch_cert_for_changes(self: Arc<Self>, tls_config: TlsConfig) {
// wait until the cert exists.
let mut digest = crate::tls::compute_digest(&tls_config.cert_path);
info!(
cert_path = tls_config.cert_path,
key_path = tls_config.key_path,
"TLS certificates found"
);
// wait until the cert exists.
let mut cert_watch = watch_cert_for_changes(tls_config.cert_path.clone()).await;
// ensure the keys are saved before continuing.
let key_pair = crate::tls::load_certs_blocking(&tls_config);
while let Err(e) =
crate::tls::update_key_path_blocking(Path::new(&self.params.pgdata), &key_pair)
{
error!("could not save TLS certificates: {e}");
std::thread::sleep(Duration::from_millis(20));
}
tokio::task::spawn_blocking(move || {
let handle = tokio::runtime::Handle::current();
'cert_update: loop {
// let postgres/pgbouncer/local_proxy know the new cert/key exists.
// we need to wait until it's configurable first.
tokio::task::spawn_blocking(move || {
'cert_update: loop {
// wait for a new certificate update
let new_digest = crate::tls::wait_until_cert_changed(digest, &tls_config.cert_path);
let mut state = self.state.lock().unwrap();
'status_update: loop {
match state.status {
// let's update the state to config pending
ComputeStatus::ConfigurationPending | ComputeStatus::Running => {
state.set_status(
ComputeStatus::ConfigurationPending,
&self.state_changed,
);
break 'status_update;
}
// load the corresponding keys
let key_pair = crate::tls::load_certs_blocking(&tls_config);
// exit loop
ComputeStatus::Failed
| ComputeStatus::TerminationPendingFast
| ComputeStatus::TerminationPendingImmediate
| ComputeStatus::Terminated => break 'cert_update,
// let postgres/pgbouncer/local_proxy know the new cert/key exists.
// we need to wait until it's configurable first.
// wait
ComputeStatus::Init
| ComputeStatus::Configuration
| ComputeStatus::RefreshConfiguration
| ComputeStatus::RefreshConfigurationPending
| ComputeStatus::Empty => {
state = self.state_changed.wait(state).unwrap();
}
let mut state = self.state.lock().unwrap();
'status_update: loop {
match state.status {
// let's update the state to config pending
ComputeStatus::Running => {
info!("reloading compute due to TLS certificate renewal");
break 'status_update;
}
// exit loop
ComputeStatus::Failed
| ComputeStatus::TerminationPendingFast
| ComputeStatus::TerminationPendingImmediate
| ComputeStatus::Terminated => break 'cert_update,
// wait
ComputeStatus::Init
| ComputeStatus::Configuration
| ComputeStatus::ConfigurationPending
| ComputeStatus::RefreshConfiguration
| ComputeStatus::RefreshConfigurationPending
| ComputeStatus::Reloading
| ComputeStatus::Empty => {
state = self.state_changed.wait(state).unwrap();
}
}
drop(state);
}
// wait for a new certificate update
if handle.block_on(cert_watch.changed()).is_err() {
break;
let result = self.lock_while_reloading(state, |spec| {
// ensure the keys are saved before continuing.
// we do this while holding the 'reloading' state so that we know we're not interfering with any
// active configuration stages.
if let Err(e) = crate::tls::update_key_path_blocking(
Path::new(&self.params.pgdata),
&key_pair,
) {
return Ok(Err(e));
}
// reload postgres/pgbouncer/local_proxy to pick up our new certificates.
self.reload(spec)?;
Ok(Ok(()))
});
match result {
// Reload failed. Compute is in a bad state.
Err(e) => {
error!("could not reload compute node: {}", e);
return;
}
// Updating the certificates failed. Retry
Ok(Err(e)) => {
error!("could not save TLS certificates: {e}");
std::thread::sleep(Duration::from_millis(20));
}
// Successful. Acknowledge that we've saved these certificates.
Ok(Ok(())) => {
digest = new_digest;
info!(
cert_path = tls_config.cert_path,
key_path = tls_config.key_path,
"TLS certificates renewed",
);
}
}
});
}
}
});
}
pub fn tls_config(&self, spec: &ComputeSpec) -> &Option<TlsConfig> {
@@ -2339,13 +2438,13 @@ impl ComputeNode {
let result = client
.simple_query(
"SELECT
pg_catalog.row_to_json(pss)
row_to_json(pg_stat_statements)
FROM
public.pg_stat_statements pss
pg_stat_statements
WHERE
pss.userid != 'cloud_admin'::pg_catalog.regrole::pg_catalog.oid
userid != 'cloud_admin'::regrole::oid
ORDER BY
(pss.mean_exec_time + pss.mean_plan_time) DESC
(mean_exec_time + mean_plan_time) DESC
LIMIT 100",
)
.await;
@@ -2473,11 +2572,11 @@ LIMIT 100",
// check the role grants first - to gracefully handle read-replicas.
let select = "SELECT privilege_type
FROM pg_catalog.pg_namespace
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) AS acl ON true
JOIN pg_catalog.pg_user users ON acl.grantee = users.usesysid
WHERE users.usename OPERATOR(pg_catalog.=) $1::pg_catalog.name
AND nspname OPERATOR(pg_catalog.=) $2::pg_catalog.name";
FROM pg_namespace
JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true
JOIN pg_user users ON acl.grantee = users.usesysid
WHERE users.usename = $1
AND nspname = $2";
let rows = db_client
.query(select, &[role_name, schema_name])
.await
@@ -2546,9 +2645,8 @@ LIMIT 100",
.await
.with_context(|| format!("Failed to execute query: {query}"))?;
} else {
let query = format!(
"CREATE EXTENSION IF NOT EXISTS {ext_name} WITH SCHEMA public VERSION {quoted_version}"
);
let query =
format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}");
db_client
.simple_query(&query)
.await

View File

@@ -78,7 +78,7 @@ impl ComputeNode {
const RETRIES: i32 = 20;
for i in 0..=RETRIES {
let row = client
.query_one("SELECT pg_catalog.pg_last_wal_replay_lsn()", &[])
.query_one("SELECT pg_last_wal_replay_lsn()", &[])
.await
.context("getting last replay lsn")?;
let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
@@ -103,7 +103,7 @@ impl ComputeNode {
.await
.context("setting safekeepers")?;
client
.query("SELECT pg_catalog.pg_reload_conf()", &[])
.query("SELECT pg_reload_conf()", &[])
.await
.context("reloading postgres config")?;
@@ -113,7 +113,7 @@ impl ComputeNode {
});
let row = client
.query_one("SELECT * FROM pg_catalog.pg_promote()", &[])
.query_one("SELECT * FROM pg_promote()", &[])
.await
.context("pg_promote")?;
if !row.get::<usize, bool>(0) {

View File

@@ -16,7 +16,7 @@ use crate::pg_helpers::{
DatabricksSettingsExt as _, GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize,
escape_conf_value,
};
use crate::tls::{self, SERVER_CRT, SERVER_KEY};
use crate::tls::{SERVER_CRT, SERVER_KEY};
use utils::shard::{ShardIndex, ShardNumber};
@@ -65,21 +65,15 @@ pub fn write_postgres_conf(
writeln!(file, "{conf}")?;
}
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
// Add options for connecting to storage
writeln!(file, "# Neon storage settings")?;
writeln!(file)?;
if let Some(conninfo) = &spec.pageserver_connection_info {
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = conninfo.stripe_size {
writeln!(
file,
"# from compute spec's pageserver_conninfo.stripe_size field"
)?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
let num_shards = if conninfo.shard_count.0 == 0 {
1 // unsharded, treat it as a single shard
} else {
@@ -112,14 +106,6 @@ pub fn write_postgres_conf(
} else {
libpq_urls = None
}
// Similarly for gRPC URLs
if let Some(url) = &first_pageserver.grpc_url {
if let Some(ref mut urls) = grpc_urls {
urls.push(url.clone());
}
} else {
grpc_urls = None
}
}
if let Some(libpq_urls) = libpq_urls {
writeln!(
@@ -131,34 +117,27 @@ pub fn write_postgres_conf(
"neon.pageserver_connstring={}",
escape_conf_value(&libpq_urls.join(","))
)?;
writeln!(file, "neon.use_communicator_worker=false")?;
} else {
writeln!(file, "# no neon.pageserver_connstring")?;
}
if let Some(grpc_urls) = grpc_urls {
if let Some(stripe_size) = conninfo.stripe_size {
writeln!(
file,
"# derived from compute spec's pageserver_conninfo field"
"# from compute spec's pageserver_conninfo.stripe_size field"
)?;
writeln!(
file,
"neon.pageserver_grpc_urls={}",
escape_conf_value(&grpc_urls.join(","))
)?;
writeln!(file, "neon.use_communicator_worker=true")?;
} else {
writeln!(file, "# no neon.pageserver_grpc_urls")?;
}
} else {
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "# from compute spec's shard_stripe_size field")?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
} else {
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "# from compute spec's pageserver_connstring field")?;
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "# from compute spec's shard_stripe_size field")?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
}
if !spec.safekeeper_connstrings.is_empty() {
@@ -199,14 +178,9 @@ pub fn write_postgres_conf(
}
// tls
if let Some(tls_config) = tls_config {
if tls_config.is_some() {
writeln!(file, "ssl = on")?;
// postgres requires the keyfile to be in a secure file,
// currently too complicated to ensure that at the VM level,
// so we just copy them to another file instead. :shrug:
tls::update_key_path_blocking(pgdata_path, tls_config);
// these are the default, but good to be explicit.
writeln!(file, "ssl_cert_file = '{SERVER_CRT}'")?;
writeln!(file, "ssl_key_file = '{SERVER_KEY}'")?;

View File

@@ -12,8 +12,10 @@ use crate::http::JsonResponse;
/// Check that the compute is currently running.
pub(in crate::http) async fn is_writable(State(compute): State<Arc<ComputeNode>>) -> Response {
let status = compute.get_status();
if status != ComputeStatus::Running {
return JsonResponse::invalid_status(status);
match status {
// If we are running, or just reloading the config, we are ok to write a new config.
ComputeStatus::Running | ComputeStatus::Reloading => {}
_ => return JsonResponse::invalid_status(status),
}
match check_writability(&compute).await {

View File

@@ -27,32 +27,6 @@ pub(in crate::http) async fn configure(
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
};
// XXX: wrap state update under lock in a code block. Otherwise, we will try
// to `Send` `mut state` into the spawned thread bellow, which will cause
// the following rustc error:
//
// error: future cannot be sent between threads safely
{
let mut state = compute.state.lock().unwrap();
if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
return JsonResponse::invalid_status(state.status);
}
// Pass the tracing span to the main thread that performs the startup,
// so that the start_compute operation is considered a child of this
// configure request for tracing purposes.
state.startup_span = Some(tracing::Span::current());
if compute.params.lakebase_mode {
ComputeNode::set_spec(&compute.params, &mut state, pspec);
} else {
state.pspec = Some(pspec);
}
state.set_status(ComputeStatus::ConfigurationPending, &compute.state_changed);
drop(state);
}
// Spawn a blocking thread to wait for compute to become Running. This is
// needed to not block the main pool of workers and to be able to serve
// other requests while some particular request is waiting for compute to
@@ -60,6 +34,32 @@ pub(in crate::http) async fn configure(
let c = compute.clone();
let completed = task::spawn_blocking(move || {
let mut state = c.state.lock().unwrap();
loop {
match state.status {
// ideal state.
ComputeStatus::Empty | ComputeStatus::Running => break,
// we need to wait until reloaded
ComputeStatus::Reloading => {
state = c.state_changed.wait(state).unwrap();
}
// All other cases are unexpected.
_ => return Err(JsonResponse::invalid_status(state.status)),
}
}
// Pass the tracing span to the main thread that performs the startup,
// so that the start_compute operation is considered a child of this
// configure request for tracing purposes.
state.startup_span = Some(tracing::Span::current());
if c.params.lakebase_mode {
ComputeNode::set_spec(&c.params, &mut state, pspec);
} else {
state.pspec = Some(pspec);
}
state.set_status(ComputeStatus::ConfigurationPending, &c.state_changed);
while state.status != ComputeStatus::Running {
state = c.state_changed.wait(state).unwrap();
info!(
@@ -71,7 +71,7 @@ pub(in crate::http) async fn configure(
if state.status == ComputeStatus::Failed {
let err = state.error.as_ref().map_or("unknown error", |x| x);
let msg = format!("compute configuration failed: {err:?}");
return Err(msg);
return Err(JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, msg));
}
}
@@ -81,7 +81,7 @@ pub(in crate::http) async fn configure(
.unwrap();
if let Err(e) = completed {
return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, e);
return e;
}
// Return current compute state if everything went well.

View File

@@ -19,7 +19,7 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
.query(
"SELECT datname FROM pg_catalog.pg_database
WHERE datallowconn
AND datconnlimit OPERATOR(pg_catalog.<>) (OPERATOR(pg_catalog.-) 2::pg_catalog.int4)
AND datconnlimit <> - 2
LIMIT 500",
&[],
)
@@ -67,7 +67,7 @@ pub async fn get_installed_extensions(
let extensions: Vec<(String, String, i32)> = client
.query(
"SELECT extname, extversion, extowner::pg_catalog.int4 FROM pg_catalog.pg_extension",
"SELECT extname, extversion, extowner::integer FROM pg_catalog.pg_extension",
&[],
)
.await?

View File

@@ -11,9 +11,11 @@ use utils::pid_file::{self, PidFileRead};
pub fn configure(local_proxy: &LocalProxySpec) -> Result<()> {
write_local_proxy_conf("/etc/local_proxy/config.json".as_ref(), local_proxy)?;
notify_local_proxy("/etc/local_proxy/pid".as_ref())?;
reload()
}
Ok(())
pub fn reload() -> Result<()> {
notify_local_proxy("/etc/local_proxy/pid".as_ref())
}
/// Create or completely rewrite configuration file specified by `path`

View File

@@ -76,7 +76,7 @@ impl<'m> MigrationRunner<'m> {
self.client
.simple_query("CREATE SCHEMA IF NOT EXISTS neon_migration")
.await?;
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key pg_catalog.int4 NOT NULL PRIMARY KEY, id pg_catalog.int8 NOT NULL DEFAULT 0)").await?;
self.client.simple_query("CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)").await?;
self.client
.simple_query(
"INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING",

View File

@@ -15,17 +15,17 @@ DO $$
DECLARE
role_name text;
BEGIN
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member')
FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I INHERIT;', role_name);
RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
END LOOP;
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles
FOR role_name IN SELECT rolname FROM pg_roles
WHERE
NOT pg_catalog.pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT pg_catalog.starts_with(rolname, 'pg_')
NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_')
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I NOBYPASSRLS;', role_name);
RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
END LOOP;
END $$;

View File

@@ -1,6 +1,6 @@
DO $$
BEGIN
IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name = 'server_version_num') THEN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}';
END IF;
END $$;

View File

@@ -5,9 +5,9 @@ DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_catalog.pg_roles WHERE rolreplication IS TRUE
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', pg_catalog.quote_ident(role_name);
EXECUTE pg_catalog.format('ALTER ROLE %I NOREPLICATION;', role_name);
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
END LOOP;
END $$;

View File

@@ -1,6 +1,6 @@
DO $$
BEGIN
IF (SELECT setting::pg_catalog.numeric >= 160000 FROM pg_catalog.pg_settings WHERE name OPERATOR(pg_catalog.=) 'server_version_num'::pg_catalog.text) THEN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}';
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}';
END IF;

View File

@@ -2,7 +2,7 @@ DO $$
DECLARE
bypassrls boolean;
BEGIN
SELECT rolbypassrls INTO bypassrls FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser';
SELECT rolbypassrls INTO bypassrls FROM pg_roles WHERE rolname = 'neon_superuser';
IF NOT bypassrls THEN
RAISE EXCEPTION 'neon_superuser cannot bypass RLS';
END IF;

View File

@@ -4,8 +4,8 @@ DECLARE
BEGIN
FOR role IN
SELECT rolname AS name, rolinherit AS inherit
FROM pg_catalog.pg_roles
WHERE pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
FROM pg_roles
WHERE pg_has_role(rolname, 'neon_superuser', 'member')
LOOP
IF NOT role.inherit THEN
RAISE EXCEPTION '% cannot inherit', quote_ident(role.name);
@@ -14,12 +14,12 @@ BEGIN
FOR role IN
SELECT rolname AS name, rolbypassrls AS bypassrls
FROM pg_catalog.pg_roles
WHERE NOT pg_catalog.pg_has_role(rolname, 'neon_superuser', 'member')
AND NOT pg_catalog.starts_with(rolname, 'pg_')
FROM pg_roles
WHERE NOT pg_has_role(rolname, 'neon_superuser', 'member')
AND NOT starts_with(rolname, 'pg_')
LOOP
IF role.bypassrls THEN
RAISE EXCEPTION '% can bypass RLS', pg_catalog.quote_ident(role.name);
RAISE EXCEPTION '% can bypass RLS', quote_ident(role.name);
END IF;
END LOOP;
END $$;

View File

@@ -1,10 +1,10 @@
DO $$
BEGIN
IF (SELECT pg_catalog.current_setting('server_version_num')::pg_catalog.numeric < 160000) THEN
IF (SELECT current_setting('server_version_num')::numeric < 160000) THEN
RETURN;
END IF;
IF NOT (SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
IF NOT (SELECT pg_has_role('neon_superuser', 'pg_create_subscription', 'member')) THEN
RAISE EXCEPTION 'neon_superuser cannot execute pg_create_subscription';
END IF;
END $$;

View File

@@ -2,12 +2,12 @@ DO $$
DECLARE
monitor record;
BEGIN
SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
SELECT pg_has_role('neon_superuser', 'pg_monitor', 'member') AS member,
admin_option AS admin
INTO monitor
FROM pg_catalog.pg_auth_members
WHERE roleid = 'pg_monitor'::pg_catalog.regrole
AND member = 'neon_superuser'::pg_catalog.regrole;
FROM pg_auth_members
WHERE roleid = 'pg_monitor'::regrole
AND member = 'neon_superuser'::regrole;
IF monitor IS NULL THEN
RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';

View File

@@ -2,11 +2,11 @@ DO $$
DECLARE
can_execute boolean;
BEGIN
SELECT pg_catalog.bool_and(pg_catalog.has_function_privilege('neon_superuser', oid, 'execute'))
SELECT bool_and(has_function_privilege('neon_superuser', oid, 'execute'))
INTO can_execute
FROM pg_catalog.pg_proc
FROM pg_proc
WHERE proname IN ('pg_export_snapshot', 'pg_log_standby_snapshot')
AND pronamespace = 'pg_catalog'::pg_catalog.regnamespace;
AND pronamespace = 'pg_catalog'::regnamespace;
IF NOT can_execute THEN
RAISE EXCEPTION 'neon_superuser cannot execute both pg_export_snapshot and pg_log_standby_snapshot';
END IF;

View File

@@ -2,9 +2,9 @@ DO $$
DECLARE
can_execute boolean;
BEGIN
SELECT pg_catalog.has_function_privilege('neon_superuser', oid, 'execute')
SELECT has_function_privilege('neon_superuser', oid, 'execute')
INTO can_execute
FROM pg_catalog.pg_proc
FROM pg_proc
WHERE proname = 'pg_show_replication_origin_status'
AND pronamespace = 'pg_catalog'::regnamespace;
IF NOT can_execute THEN

View File

@@ -2,10 +2,10 @@ DO $$
DECLARE
signal_backend record;
BEGIN
SELECT pg_catalog.pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
admin_option AS admin
INTO signal_backend
FROM pg_catalog.pg_auth_members
FROM pg_auth_members
WHERE roleid = 'pg_signal_backend'::regrole
AND member = 'neon_superuser'::regrole;

View File

@@ -407,9 +407,9 @@ fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> {
// like `postgres_exporter` use it to query Postgres statistics.
// Use explicit 8 bytes type casts to match Rust types.
let stats = cli.query_one(
"SELECT pg_catalog.coalesce(pg_catalog.sum(active_time), 0.0)::pg_catalog.float8 AS total_active_time,
pg_catalog.coalesce(pg_catalog.sum(sessions), 0)::pg_catalog.bigint AS total_sessions
FROM pg_catalog.pg_stat_database
"SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time,
coalesce(sum(sessions), 0)::bigint AS total_sessions
FROM pg_stat_database
WHERE datname NOT IN (
'postgres',
'template0',
@@ -445,11 +445,11 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime
let mut last_active: Option<DateTime<Utc>> = None;
// Get all running client backends except ourself, use RFC3339 DateTime format.
let backends = cli.query(
"SELECT state, pg_catalog.to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"'::pg_catalog.text) AS state_change
"SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
FROM pg_stat_activity
WHERE backend_type OPERATOR(pg_catalog.=) 'client backend'::pg_catalog.text
AND pid OPERATOR(pg_catalog.!=) pg_catalog.pg_backend_pid()
AND usename OPERATOR(pg_catalog.!=) 'cloud_admin'::pg_catalog.name;", // XXX: find a better way to filter other monitors?
WHERE backend_type = 'client backend'
AND pid != pg_backend_pid()
AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
&[],
);

View File

@@ -299,9 +299,9 @@ pub async fn get_existing_dbs_async(
.query_raw::<str, &String, &[String; 0]>(
"SELECT
datname AS name,
(SELECT rolname FROM pg_catalog.pg_roles WHERE oid OPERATOR(pg_catalog.=) datdba) AS owner,
(SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
NOT datallowconn AS restrict_conn,
datconnlimit OPERATOR(pg_catalog.=) (OPERATOR(pg_catalog.-) 2) AS invalid
datconnlimit = - 2 AS invalid
FROM
pg_catalog.pg_database;",
&[],
@@ -466,13 +466,7 @@ fn update_pgbouncer_ini(
Ok(())
}
/// Tune pgbouncer.
/// 1. Apply new config using pgbouncer admin console
/// 2. Add new values to pgbouncer.ini to preserve them after restart
pub async fn tune_pgbouncer(
mut pgbouncer_config: IndexMap<String, String>,
tls_config: Option<TlsConfig>,
) -> Result<()> {
async fn connect() -> Result<tokio_postgres::Client> {
let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
// for VMs use pgbouncer specific way to connect to
// pgbouncer admin console without password
@@ -518,18 +512,17 @@ pub async fn tune_pgbouncer(
}
};
if let Some(tls_config) = tls_config {
// pgbouncer starts in a half-ok state if it cannot find these files.
// It will default to client_tls_sslmode=deny, which causes proxy to error.
// There is a small window at startup where these files don't yet exist in the VM.
// Best to wait until it exists.
loop {
if let Ok(true) = tokio::fs::try_exists(&tls_config.key_path).await {
break;
}
tokio::time::sleep(Duration::from_millis(500)).await
}
Ok(client)
}
/// Tune pgbouncer.
/// 1. Apply new config to pgbouncer.ini
/// 2. Notify pgbouncer to reload
pub async fn tune_pgbouncer(
mut pgbouncer_config: IndexMap<String, String>,
tls_config: Option<TlsConfig>,
) -> Result<()> {
if let Some(tls_config) = tls_config {
pgbouncer_config.insert("client_tls_cert_file".to_string(), tls_config.cert_path);
pgbouncer_config.insert("client_tls_key_file".to_string(), tls_config.key_path);
pgbouncer_config.insert("client_tls_sslmode".to_string(), "allow".to_string());
@@ -550,10 +543,17 @@ pub async fn tune_pgbouncer(
info!("Applying pgbouncer setting change");
reload_pgbouncer().await
}
/// Reload pgbouncer.
pub async fn reload_pgbouncer() -> Result<()> {
let client = connect().await?;
if let Err(err) = client.simple_query("RELOAD").await {
// Don't fail on error, just print it into log
error!("Failed to apply pgbouncer setting change, {err}",);
};
error!("Failed to apply pgbouncer setting change: {err}",);
}
Ok(())
}

View File

@@ -82,7 +82,7 @@ impl ComputeNode {
info!("Checking if drop subscription operation was already performed for timeline_id: {}", timeline_id);
drop_subscriptions_done = match
client.query("select 1 from neon.drop_subscriptions_done where timeline_id OPERATOR(pg_catalog.=) $1", &[&timeline_id.to_string()]).await {
client.query("select 1 from neon.drop_subscriptions_done where timeline_id = $1", &[&timeline_id.to_string()]).await {
Ok(result) => !result.is_empty(),
Err(e) =>
{
@@ -1142,9 +1142,7 @@ async fn get_operations<'a>(
if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
if libs.contains("pg_stat_statements") {
return Ok(Box::new(once(Operation {
query: String::from(
"CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA public",
),
query: String::from("CREATE EXTENSION IF NOT EXISTS pg_stat_statements"),
comment: Some(String::from("create system extensions")),
})));
}
@@ -1152,13 +1150,11 @@ async fn get_operations<'a>(
Ok(Box::new(empty()))
}
ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit WITH SCHEMA public"),
query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
comment: Some(String::from("create pgaudit extensions")),
}))),
ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
query: String::from(
"CREATE EXTENSION IF NOT EXISTS pgauditlogtofile WITH SCHEMA public",
),
query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
comment: Some(String::from("create pgauditlogtofile extensions")),
}))),
// Disable pgaudit logging for postgres database.
@@ -1182,7 +1178,7 @@ async fn get_operations<'a>(
},
Operation {
query: String::from(
"UPDATE pg_catalog.pg_extension SET extrelocatable = true WHERE extname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name AND extrelocatable OPERATOR(pg_catalog.=) false",
"UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'",
),
comment: Some(String::from("compat/fix: make neon relocatable")),
},

View File

@@ -3,17 +3,16 @@ BEGIN
IF NOT EXISTS(
SELECT 1
FROM pg_catalog.pg_tables
WHERE tablename::pg_catalog.name OPERATOR(pg_catalog.=) 'health_check'::pg_catalog.name
AND schemaname::pg_catalog.name OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
WHERE tablename = 'health_check'
)
THEN
CREATE TABLE public.health_check (
id pg_catalog.int4 primary key generated by default as identity,
updated_at pg_catalog.timestamptz default pg_catalog.now()
CREATE TABLE health_check (
id serial primary key,
updated_at timestamptz default now()
);
INSERT INTO public.health_check VALUES (1, pg_catalog.now())
INSERT INTO health_check VALUES (1, now())
ON CONFLICT (id) DO UPDATE
SET updated_at = pg_catalog.now();
SET updated_at = now();
END IF;
END
$$

View File

@@ -2,11 +2,10 @@ DO $$
DECLARE
query varchar;
BEGIN
FOR query IN
SELECT pg_catalog.format('ALTER FUNCTION %I(%s) OWNER TO {db_owner};', p.oid::regproc, pg_catalog.pg_get_function_identity_arguments(p.oid))
FROM pg_catalog.pg_proc p
WHERE p.pronamespace OPERATOR(pg_catalog.=) 'anon'::regnamespace::oid
LOOP
FOR query IN SELECT 'ALTER FUNCTION '||nsp.nspname||'.'||p.proname||'('||pg_get_function_identity_arguments(p.oid)||') OWNER TO {db_owner};'
FROM pg_proc p
JOIN pg_namespace nsp ON p.pronamespace = nsp.oid
WHERE nsp.nspname = 'anon' LOOP
EXECUTE query;
END LOOP;
END

View File

@@ -1,6 +1,6 @@
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname OPERATOR(pg_catalog.=) '{privileged_role_name}'::pg_catalog.name)
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}')
THEN
CREATE ROLE {privileged_role_name} {privileges} IN ROLE pg_read_all_data, pg_write_all_data;
END IF;

View File

@@ -4,14 +4,14 @@ $$
IF EXISTS(
SELECT nspname
FROM pg_catalog.pg_namespace
WHERE nspname OPERATOR(pg_catalog.=) 'public'
WHERE nspname = 'public'
) AND
pg_catalog.current_setting('server_version_num')::int OPERATOR(pg_catalog./) 10000 OPERATOR(pg_catalog.>=) 15
current_setting('server_version_num')::int / 10000 >= 15
THEN
IF EXISTS(
SELECT rolname
FROM pg_catalog.pg_roles
WHERE rolname OPERATOR(pg_catalog.=) 'web_access'
WHERE rolname = 'web_access'
)
THEN
GRANT CREATE ON SCHEMA public TO web_access;
@@ -20,7 +20,7 @@ $$
IF EXISTS(
SELECT nspname
FROM pg_catalog.pg_namespace
WHERE nspname OPERATOR(pg_catalog.=) 'public'
WHERE nspname = 'public'
)
THEN
ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;

View File

@@ -2,17 +2,11 @@ DO ${outer_tag}$
DECLARE
subname TEXT;
BEGIN
LOCK TABLE pg_catalog.pg_subscription IN ACCESS EXCLUSIVE MODE;
FOR subname IN
SELECT pg_subscription.subname
FROM pg_catalog.pg_subscription
WHERE subdbid OPERATOR(pg_catalog.=) (
SELECT oid FROM pg_database WHERE datname OPERATOR(pg_catalog.=) {datname_str}::pg_catalog.name
)
LOOP
EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I DISABLE;', subname);
EXECUTE pg_catalog.format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
EXECUTE pg_catalog.format('DROP SUBSCRIPTION %I;', subname);
LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE;
FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP
EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname);
EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
EXECUTE format('DROP SUBSCRIPTION %I;', subname);
END LOOP;
END;
${outer_tag}$;

View File

@@ -3,19 +3,19 @@ BEGIN
IF NOT EXISTS(
SELECT 1
FROM pg_catalog.pg_tables
WHERE tablename OPERATOR(pg_catalog.=) 'drop_subscriptions_done'::pg_catalog.name
AND schemaname OPERATOR(pg_catalog.=) 'neon'::pg_catalog.name
WHERE tablename = 'drop_subscriptions_done'
AND schemaname = 'neon'
)
THEN
CREATE TABLE neon.drop_subscriptions_done
(id pg_catalog.int4 primary key generated by default as identity, timeline_id pg_catalog.text);
(id serial primary key, timeline_id text);
END IF;
-- preserve the timeline_id of the last drop_subscriptions run
-- to ensure that the cleanup of a timeline is executed only once.
-- use upsert to avoid the table bloat in case of cascade branching (branch of a branch)
INSERT INTO neon.drop_subscriptions_done VALUES (1, pg_catalog.current_setting('neon.timeline_id'))
INSERT INTO neon.drop_subscriptions_done VALUES (1, current_setting('neon.timeline_id'))
ON CONFLICT (id) DO UPDATE
SET timeline_id = pg_catalog.current_setting('neon.timeline_id')::pg_catalog.text;
SET timeline_id = current_setting('neon.timeline_id');
END
$$

View File

@@ -15,15 +15,15 @@ BEGIN
WHERE schema_name IN ('public')
LOOP
FOR grantor IN EXECUTE
pg_catalog.format(
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee OPERATOR(pg_catalog.=) %s',
format(
'SELECT DISTINCT rtg.grantor FROM information_schema.role_table_grants AS rtg WHERE grantee = %s',
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
quote_literal({role_name})
)
LOOP
EXECUTE pg_catalog.format('SET LOCAL ROLE %I', grantor);
EXECUTE format('SET LOCAL ROLE %I', grantor);
revoke_query := pg_catalog.format(
revoke_query := format(
'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY %I',
schema,
-- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`

View File

@@ -5,17 +5,17 @@ DO ${outer_tag}$
IF EXISTS(
SELECT nspname
FROM pg_catalog.pg_namespace
WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.name
WHERE nspname = 'public'
)
THEN
SELECT nspowner::regrole::text
FROM pg_catalog.pg_namespace
WHERE nspname OPERATOR(pg_catalog.=) 'public'::pg_catalog.text
WHERE nspname = 'public'
INTO schema_owner;
IF schema_owner OPERATOR(pg_catalog.=) 'cloud_admin'::pg_catalog.text OR schema_owner OPERATOR(pg_catalog.=) 'zenith_admin'::pg_catalog.text
IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
THEN
EXECUTE pg_catalog.format('ALTER SCHEMA public OWNER TO %I', {db_owner});
EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
END IF;
END IF;
END

View File

@@ -3,10 +3,10 @@ DO ${outer_tag}$
IF EXISTS(
SELECT 1
FROM pg_catalog.pg_database
WHERE datname OPERATOR(pg_catalog.=) {datname}::pg_catalog.name
WHERE datname = {datname}
)
THEN
EXECUTE pg_catalog.format('ALTER DATABASE %I is_template false', {datname});
EXECUTE format('ALTER DATABASE %I is_template false', {datname});
END IF;
END
${outer_tag}$;

View File

@@ -3,42 +3,43 @@ use std::{io::Write, os::unix::fs::OpenOptionsExt, path::Path, time::Duration};
use anyhow::{Context, Result, bail};
use compute_api::responses::TlsConfig;
use ring::digest;
use x509_cert::Certificate;
#[derive(Clone, Copy)]
pub struct CertDigest(digest::Digest);
pub async fn watch_cert_for_changes(cert_path: String) -> tokio::sync::watch::Receiver<CertDigest> {
let mut digest = compute_digest(&cert_path).await;
let (tx, rx) = tokio::sync::watch::channel(digest);
tokio::spawn(async move {
while !tx.is_closed() {
let new_digest = compute_digest(&cert_path).await;
if digest.0.as_ref() != new_digest.0.as_ref() {
digest = new_digest;
_ = tx.send(digest);
}
tokio::time::sleep(Duration::from_secs(60)).await
}
});
rx
impl PartialEq for CertDigest {
fn eq(&self, other: &Self) -> bool {
self.0.as_ref() == other.0.as_ref()
}
}
async fn compute_digest(cert_path: &str) -> CertDigest {
pub fn wait_until_cert_changed(digest: CertDigest, cert_path: &str) -> CertDigest {
loop {
match try_compute_digest(cert_path).await {
let new_digest = compute_digest(cert_path);
if digest != new_digest {
break new_digest;
}
// Wait a while before checking the certificates.
// We renew on a daily basis, so there's no rush.
std::thread::sleep(Duration::from_secs(60));
}
}
pub fn compute_digest(cert_path: &str) -> CertDigest {
loop {
match try_compute_digest(cert_path) {
Ok(d) => break d,
Err(e) => {
tracing::error!("could not read cert file {e:?}");
tokio::time::sleep(Duration::from_secs(1)).await
std::thread::sleep(Duration::from_secs(1))
}
}
}
}
async fn try_compute_digest(cert_path: &str) -> Result<CertDigest> {
let data = tokio::fs::read(cert_path).await?;
fn try_compute_digest(cert_path: &str) -> Result<CertDigest> {
let data = std::fs::read(cert_path)?;
// sha256 is extremely collision resistent. can safely assume the digest to be unique
Ok(CertDigest(digest::digest(&digest::SHA256, &data)))
}
@@ -46,28 +47,37 @@ async fn try_compute_digest(cert_path: &str) -> Result<CertDigest> {
pub const SERVER_CRT: &str = "server.crt";
pub const SERVER_KEY: &str = "server.key";
pub fn update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) {
pub struct KeyPair {
crt: String,
key: String,
}
pub fn load_certs_blocking(tls_config: &TlsConfig) -> KeyPair {
loop {
match try_update_key_path_blocking(pg_data, tls_config) {
Ok(()) => break,
match try_load_certs_blocking(tls_config) {
Ok(key_pair) => break key_pair,
Err(e) => {
tracing::error!(error = ?e, "could not create key file");
tracing::error!(error = ?e, "could not load certs");
std::thread::sleep(Duration::from_secs(1))
}
}
}
}
// Postgres requires the keypath be "secure". This means
// 1. Owned by the postgres user.
// 2. Have permission 600.
fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Result<()> {
fn try_load_certs_blocking(tls_config: &TlsConfig) -> Result<KeyPair> {
let key = std::fs::read_to_string(&tls_config.key_path)?;
let crt = std::fs::read_to_string(&tls_config.cert_path)?;
// to mitigate a race condition during renewal.
verify_key_cert(&key, &crt)?;
Ok(KeyPair { key, crt })
}
// Postgres requires the keypath be "secure". This means
// 1. Owned by the postgres user.
// 2. Have permission 600.
pub fn update_key_path_blocking(pg_data: &Path, key_pair: &KeyPair) -> Result<()> {
let mut key_file = std::fs::OpenOptions::new()
.write(true)
.create(true)
@@ -82,14 +92,22 @@ fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Resul
.mode(0o600)
.open(pg_data.join(SERVER_CRT))?;
key_file.write_all(key.as_bytes())?;
crt_file.write_all(crt.as_bytes())?;
// NOTE: We currently ensure that an explicit reload does not happen during TLS renewal, but
// there's a chance that postgres/pgbouncer/local_proxy reloads implicitly halfway between
// these writes. This could allow them to reads the wrong keys to the wrong certs.
// There doesn't seem to be any way to prevent that. However, we will issue a reload shortly
// after which should at least correct it.
key_file.write_all(key_pair.key.as_bytes())?;
crt_file.write_all(key_pair.crt.as_bytes())?;
Ok(())
}
fn verify_key_cert(key: &str, cert: &str) -> Result<()> {
use x509_cert::Certificate;
use x509_cert::der::oid::db::rfc5912::ECDSA_WITH_SHA_256;
use x509_cert::der::oid::db::rfc8410::ID_ED_25519;
use x509_cert::der::pem;
let certs = Certificate::load_pem_chain(cert.as_bytes())
.context("decoding PEM encoded certificates")?;
@@ -100,22 +118,30 @@ fn verify_key_cert(key: &str, cert: &str) -> Result<()> {
bail!("no certificates found");
};
let pubkey = cert
.tbs_certificate
.subject_public_key_info
.subject_public_key
.raw_bytes();
match cert.signature_algorithm.oid {
ECDSA_WITH_SHA_256 => {
let key = p256::SecretKey::from_sec1_pem(key).context("parse key")?;
let a = key.public_key().to_sec1_bytes();
let b = cert
.tbs_certificate
.subject_public_key_info
.subject_public_key
.raw_bytes();
if *a != *b {
if *key.public_key().to_sec1_bytes() != *pubkey {
bail!("private key file does not match certificate")
}
}
_ => bail!("unknown TLS key type"),
ID_ED_25519 => {
use ring::signature::{Ed25519KeyPair, KeyPair};
let (_, bytes) = pem::decode_vec(key.as_bytes())
.map_err(|_| anyhow::anyhow!("invalid key encoding"))?;
let key = Ed25519KeyPair::from_pkcs8_maybe_unchecked(&bytes).context("parse key")?;
if *key.public_key().as_ref() != *pubkey {
bail!("private key file does not match certificate")
}
}
oid => bail!("unknown TLS key type: {oid}"),
}
Ok(())

View File

@@ -1089,7 +1089,8 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
default_tenant_id: TenantId::from_array(std::array::from_fn(|_| 0)),
storage_controller: None,
control_plane_hooks_api: None,
generate_local_ssl_certs: false,
generate_local_tls_certs: false,
generate_compute_tls_certs: false,
}
};

View File

@@ -23,7 +23,7 @@ impl StorageBroker {
}
pub fn initialize(&self) -> anyhow::Result<()> {
if self.env.generate_local_ssl_certs {
if self.env.generate_local_tls_certs {
self.env.generate_ssl_cert(
&self.env.storage_broker_data_dir().join("server.crt"),
&self.env.storage_broker_data_dir().join("server.key"),

View File

@@ -54,7 +54,6 @@ use compute_api::requests::{
};
use compute_api::responses::{
ComputeConfig, ComputeCtlConfig, ComputeStatus, ComputeStatusResponse, TerminateResponse,
TlsConfig,
};
use compute_api::spec::{
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
@@ -213,8 +212,13 @@ impl ComputeControlPlane {
let internal_http_port = internal_http_port.unwrap_or_else(|| external_http_port + 1);
let compute_ctl_config = ComputeCtlConfig {
jwks: Self::create_jwks_from_pem(&self.env.read_public_key()?)?,
tls: None::<TlsConfig>,
tls: self.env.get_tls_config()?,
};
let mut features = vec![];
if compute_ctl_config.tls.is_some() {
features.push(ComputeFeature::TlsExperimental);
}
let ep = Arc::new(Endpoint {
endpoint_id: endpoint_id.to_owned(),
pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), pg_port),
@@ -241,7 +245,7 @@ impl ComputeControlPlane {
drop_subscriptions_before_start,
grpc,
reconfigure_concurrency: 1,
features: vec![],
features: features.clone(),
cluster: None,
compute_ctl_config: compute_ctl_config.clone(),
privileged_role_name: privileged_role_name.clone(),
@@ -263,7 +267,7 @@ impl ComputeControlPlane {
skip_pg_catalog_updates,
drop_subscriptions_before_start,
reconfigure_concurrency: 1,
features: vec![],
features,
cluster: None,
compute_ctl_config,
privileged_role_name,
@@ -953,7 +957,7 @@ impl Endpoint {
}
// keep retrying
}
ComputeStatus::Running => {
ComputeStatus::Reloading | ComputeStatus::Running => {
// All good!
break;
}

View File

@@ -12,6 +12,7 @@ use std::{env, fs};
use anyhow::{Context, bail};
use clap::ValueEnum;
use compute_api::responses::TlsConfig;
use pageserver_api::config::PostHogConfig;
use pem::Pem;
use postgres_backend::AuthType;
@@ -95,7 +96,10 @@ pub struct LocalEnv {
/// Flag to generate SSL certificates for components that need it.
/// Also generates root CA certificate that is used to sign all other certificates.
pub generate_local_ssl_certs: bool,
pub generate_local_tls_certs: bool,
/// Flag to generate SSL certificates for compute.
pub generate_compute_tls_certs: bool,
}
/// On-disk state stored in `.neon/config`.
@@ -123,7 +127,11 @@ pub struct OnDiskConfig {
// Note: skip serializing because in compat tests old storage controller fails
// to load new config file. May be removed after this field is in release branch.
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub generate_local_ssl_certs: bool,
pub generate_local_tls_certs: bool,
// Note: skip serializing because in compat tests old storage controller fails
// to load new config file. May be removed after this field is in release branch.
#[serde(skip_serializing_if = "std::ops::Not::not")]
pub generate_compute_tls_certs: bool,
}
fn fail_if_pageservers_field_specified<'de, D>(_: D) -> Result<Vec<PageServerConf>, D::Error>
@@ -152,7 +160,8 @@ pub struct NeonLocalInitConf {
pub endpoint_storage: EndpointStorageConf,
pub control_plane_api: Option<Url>,
pub control_plane_hooks_api: Option<Url>,
pub generate_local_ssl_certs: bool,
pub generate_local_tls_certs: bool,
pub generate_compute_tls_certs: bool,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
@@ -511,7 +520,7 @@ impl LocalEnv {
}
pub fn ssl_ca_cert_path(&self) -> Option<PathBuf> {
if self.generate_local_ssl_certs {
if self.generate_local_tls_certs {
Some(self.base_data_dir.join("rootCA.crt"))
} else {
None
@@ -519,7 +528,7 @@ impl LocalEnv {
}
pub fn ssl_ca_key_path(&self) -> Option<PathBuf> {
if self.generate_local_ssl_certs {
if self.generate_local_tls_certs {
Some(self.base_data_dir.join("rootCA.key"))
} else {
None
@@ -545,6 +554,33 @@ impl LocalEnv {
)
}
fn compute_ssl_paths(&self) -> Option<(PathBuf, PathBuf)> {
if self.generate_compute_tls_certs {
Some((
self.base_data_dir.join("compute_server.crt"),
self.base_data_dir.join("compute_server.key"),
))
} else {
None
}
}
pub fn generate_compute_ssl_cert(&self) -> anyhow::Result<()> {
self.generate_ssl_ca_cert()?;
let (cert_path, key_path) = self.compute_ssl_paths().unwrap();
if !fs::exists(&cert_path)? {
generate_ssl_cert(
&cert_path,
&key_path,
self.ssl_ca_cert_path().unwrap().as_path(),
self.ssl_ca_key_path().unwrap().as_path(),
)?;
}
Ok(())
}
/// Creates HTTP client with local SSL CA certificates.
pub fn create_http_client(&self) -> reqwest::Client {
let ssl_ca_certs = self.ssl_ca_cert_path().map(|ssl_ca_file| {
@@ -673,7 +709,8 @@ impl LocalEnv {
control_plane_hooks_api,
control_plane_compute_hook_api: _,
branch_name_mappings,
generate_local_ssl_certs,
generate_local_tls_certs,
generate_compute_tls_certs,
endpoint_storage,
} = on_disk_config;
LocalEnv {
@@ -690,7 +727,8 @@ impl LocalEnv {
control_plane_api: control_plane_api.unwrap(),
control_plane_hooks_api,
branch_name_mappings,
generate_local_ssl_certs,
generate_local_tls_certs,
generate_compute_tls_certs,
endpoint_storage,
}
};
@@ -806,7 +844,8 @@ impl LocalEnv {
control_plane_hooks_api: self.control_plane_hooks_api.clone(),
control_plane_compute_hook_api: None,
branch_name_mappings: self.branch_name_mappings.clone(),
generate_local_ssl_certs: self.generate_local_ssl_certs,
generate_local_tls_certs: self.generate_local_tls_certs,
generate_compute_tls_certs: self.generate_compute_tls_certs,
endpoint_storage: self.endpoint_storage.clone(),
},
)
@@ -861,6 +900,21 @@ impl LocalEnv {
Ok(pem)
}
/// Get the TLS config if set.
pub fn get_tls_config(&self) -> anyhow::Result<Option<TlsConfig>> {
match self.compute_ssl_paths() {
Some((cert_path, key_path)) => {
self.generate_compute_ssl_cert()?;
Ok(Some(TlsConfig {
key_path: key_path.to_str().context("utf8")?.to_string(),
cert_path: cert_path.to_str().context("utf8")?.to_string(),
}))
}
None => Ok(None),
}
}
/// Materialize the [`NeonLocalInitConf`] to disk. Called during [`neon_local init`].
pub fn init(conf: NeonLocalInitConf, force: &InitForceMode) -> anyhow::Result<()> {
let base_path = base_path();
@@ -912,7 +966,8 @@ impl LocalEnv {
pageservers,
safekeepers,
control_plane_api,
generate_local_ssl_certs,
generate_local_tls_certs,
generate_compute_tls_certs,
control_plane_hooks_api,
endpoint_storage,
} = conf;
@@ -965,13 +1020,17 @@ impl LocalEnv {
control_plane_api: control_plane_api.unwrap(),
control_plane_hooks_api,
branch_name_mappings: Default::default(),
generate_local_ssl_certs,
generate_local_tls_certs,
generate_compute_tls_certs,
endpoint_storage,
};
if generate_local_ssl_certs {
if generate_local_tls_certs {
env.generate_ssl_ca_cert()?;
}
if generate_compute_tls_certs {
env.generate_compute_ssl_cert()?;
}
// create endpoints dir
fs::create_dir_all(env.endpoints_path())?;

View File

@@ -241,7 +241,7 @@ impl PageServerNode {
.context("write identity toml")?;
drop(identity_toml);
if self.env.generate_local_ssl_certs {
if self.env.generate_local_tls_certs {
self.env.generate_ssl_cert(
datadir.join("server.crt").as_path(),
datadir.join("server.key").as_path(),

View File

@@ -102,7 +102,7 @@ impl SafekeeperNode {
/// Initializes a safekeeper node by creating all necessary files,
/// e.g. SSL certificates and JWT token file.
pub fn initialize(&self) -> anyhow::Result<()> {
if self.env.generate_local_ssl_certs {
if self.env.generate_local_tls_certs {
self.env.generate_ssl_cert(
&self.datadir_path().join("server.crt"),
&self.datadir_path().join("server.key"),

View File

@@ -353,7 +353,7 @@ impl StorageController {
}
}
if self.env.generate_local_ssl_certs {
if self.env.generate_local_tls_certs {
self.env.generate_ssl_cert(
&instance_dir.join("server.crt"),
&instance_dir.join("server.key"),

View File

@@ -120,11 +120,6 @@
"value": "host=pageserver port=6400",
"vartype": "string"
},
{
"name": "neon.pageserver_grpc_urls",
"value": "grpc://pageserver:6401/",
"vartype": "string"
},
{
"name": "max_replication_write_lag",
"value": "500MB",

View File

@@ -1,7 +1,6 @@
broker_endpoint='http://storage_broker:50051'
pg_distrib_dir='/usr/local/'
listen_pg_addr='0.0.0.0:6400'
listen_grpc_addr='0.0.0.0:6401'
listen_http_addr='0.0.0.0:9898'
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address

View File

@@ -27,7 +27,6 @@ pub struct ComputeConfig {
pub spec: Option<ComputeSpec>,
/// The compute_ctl configuration
#[allow(dead_code)]
pub compute_ctl_config: ComputeCtlConfig,
}
@@ -155,6 +154,8 @@ pub enum ComputeStatus {
Empty,
// Compute configuration was requested.
ConfigurationPending,
// Postgres, pgbouncer, and local_proxy is currently being reloaded.
Reloading,
// Compute node has spec and initial startup and
// configuration is in progress.
Init,
@@ -189,6 +190,7 @@ impl Display for ComputeStatus {
match self {
ComputeStatus::Empty => f.write_str("empty"),
ComputeStatus::ConfigurationPending => f.write_str("configuration-pending"),
ComputeStatus::Reloading => f.write_str("reloading"),
ComputeStatus::RefreshConfiguration => f.write_str("refresh-configuration"),
ComputeStatus::RefreshConfigurationPending => {
f.write_str("refresh-configuration-pending")

View File

@@ -6,26 +6,15 @@ license.workspace = true
[dependencies]
thiserror.workspace = true
nix.workspace = true
nix.workspace=true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
libc.workspace = true
lock_api.workspace = true
rustc-hash.workspace = true
[dev-dependencies]
criterion = { workspace = true, features = ["html_reports"] }
rand = "0.9"
rand_distr = "0.5.1"
xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
ahash.workspace = true
twox-hash = { version = "2.1.1" }
seahash = "4.1.0"
hashbrown = { git = "https://github.com/quantumish/hashbrown.git", rev = "6610e6d" }
[target.'cfg(target_os = "macos")'.dependencies]
tempfile = "3.14.0"
[[bench]]
name = "hmap_resize"
harness = false
[dev-dependencies]
rand.workspace = true
rand_distr = "0.5.1"

View File

@@ -1,330 +0,0 @@
use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
use neon_shmem::hash::HashMapAccess;
use neon_shmem::hash::HashMapInit;
use neon_shmem::hash::entry::Entry;
use rand::distr::{Distribution, StandardUniform};
use rand::prelude::*;
use std::default::Default;
use std::hash::BuildHasher;
// Taken from bindings to C code
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
#[repr(C)]
pub struct FileCacheKey {
pub _spc_id: u32,
pub _db_id: u32,
pub _rel_number: u32,
pub _fork_num: u32,
pub _block_num: u32,
}
impl Distribution<FileCacheKey> for StandardUniform {
// questionable, but doesn't need to be good randomness
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> FileCacheKey {
FileCacheKey {
_spc_id: rng.random(),
_db_id: rng.random(),
_rel_number: rng.random(),
_fork_num: rng.random(),
_block_num: rng.random(),
}
}
}
#[derive(Clone, Debug)]
#[repr(C)]
pub struct FileCacheEntry {
pub _offset: u32,
pub _access_count: u32,
pub _prev: *mut FileCacheEntry,
pub _next: *mut FileCacheEntry,
pub _state: [u32; 8],
}
impl FileCacheEntry {
fn dummy() -> Self {
Self {
_offset: 0,
_access_count: 0,
_prev: std::ptr::null_mut(),
_next: std::ptr::null_mut(),
_state: [0; 8],
}
}
}
// Utilities for applying operations.
#[derive(Clone, Debug)]
struct TestOp<K, V>(K, Option<V>);
fn apply_op<K: Clone + std::hash::Hash + Eq, V, S: std::hash::BuildHasher>(
op: TestOp<K, V>,
map: &mut HashMapAccess<K, V, S>,
) {
let entry = map.entry(op.0);
match op.1 {
Some(new) => match entry {
Entry::Occupied(mut e) => Some(e.insert(new)),
Entry::Vacant(e) => {
_ = e.insert(new).unwrap();
None
}
},
None => match entry {
Entry::Occupied(e) => Some(e.remove()),
Entry::Vacant(_) => None,
},
};
}
// Hash utilities
struct SeaRandomState {
k1: u64,
k2: u64,
k3: u64,
k4: u64,
}
impl std::hash::BuildHasher for SeaRandomState {
type Hasher = seahash::SeaHasher;
fn build_hasher(&self) -> Self::Hasher {
seahash::SeaHasher::with_seeds(self.k1, self.k2, self.k3, self.k4)
}
}
impl SeaRandomState {
fn new() -> Self {
let mut rng = rand::rng();
Self {
k1: rng.random(),
k2: rng.random(),
k3: rng.random(),
k4: rng.random(),
}
}
}
fn small_benchs(c: &mut Criterion) {
let mut group = c.benchmark_group("Small maps");
group.sample_size(10);
group.bench_function("small_rehash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_xxhash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(twox_hash::xxhash64::RandomState::default())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_ahash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(ahash::RandomState::default())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_seahash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(SeaRandomState::new())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.finish();
}
fn real_benchs(c: &mut Criterion) {
let mut group = c.benchmark_group("Realistic workloads");
group.sample_size(10);
group.bench_function("real_bulk_insert", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut rng = rand::rng();
b.iter_batched(
|| HashMapInit::new_resizeable(size, size * 2).attach_writer(),
|writer| {
for _ in 0..ideal_filled {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
let entry = writer.entry(key);
match entry {
Entry::Occupied(mut e) => {
std::hint::black_box(e.insert(val));
}
Entry::Vacant(e) => {
let _ = std::hint::black_box(e.insert(val).unwrap());
}
}
}
},
BatchSize::SmallInput,
)
});
group.bench_function("real_rehash", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("real_rehash_hashbrown", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut writer = hashbrown::raw::RawTable::new();
let mut rng = rand::rng();
let hasher = rustc_hash::FxBuildHasher;
unsafe {
writer
.resize(
size,
|(k, _)| hasher.hash_one(k),
hashbrown::raw::Fallibility::Infallible,
)
.unwrap();
}
while writer.len() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
hasher.hash_one(k)
});
}
b.iter(|| unsafe {
writer.table.rehash_in_place(
&|table, index| {
hasher.hash_one(
&table
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
.as_ref()
.0,
)
},
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
Some(|ptr| std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry)))
} else {
None
},
)
});
});
for elems in [2, 4, 8, 16, 32, 64, 96, 112] {
group.bench_with_input(
BenchmarkId::new("real_rehash_varied", elems),
&elems,
|b, &size| {
let ideal_filled = size * 1_000_000;
let size = 125_000_000;
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
},
);
group.bench_with_input(
BenchmarkId::new("real_rehash_varied_hashbrown", elems),
&elems,
|b, &size| {
let ideal_filled = size * 1_000_000;
let size = 125_000_000;
let mut writer = hashbrown::raw::RawTable::new();
let mut rng = rand::rng();
let hasher = rustc_hash::FxBuildHasher;
unsafe {
writer
.resize(
size,
|(k, _)| hasher.hash_one(k),
hashbrown::raw::Fallibility::Infallible,
)
.unwrap();
}
while writer.len() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
hasher.hash_one(k)
});
}
b.iter(|| unsafe {
writer.table.rehash_in_place(
&|table, index| {
hasher.hash_one(
&table
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
.as_ref()
.0,
)
},
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
Some(|ptr| {
std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry))
})
} else {
None
},
)
});
},
);
}
group.finish();
}
criterion_group!(benches, small_benchs, real_benchs);
criterion_main!(benches);

View File

@@ -16,7 +16,6 @@
//!
//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock.
use std::fmt::Debug;
use std::hash::{BuildHasher, Hash};
use std::mem::MaybeUninit;
@@ -57,22 +56,6 @@ pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
num_buckets: u32,
}
impl<'a, K, V, S> Debug for HashMapInit<'a, K, V, S>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HashMapInit")
.field("shmem_handle", &self.shmem_handle)
.field("shared_ptr", &self.shared_ptr)
.field("shared_size", &self.shared_size)
// .field("hasher", &self.hasher)
.field("num_buckets", &self.num_buckets)
.finish()
}
}
/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
/// If a child process is launched with fork(), the child process should
/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
@@ -88,20 +71,6 @@ pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
impl<'a, K, V, S> Debug for HashMapAccess<'a, K, V, S>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HashMapAccess")
.field("shmem_handle", &self.shmem_handle)
.field("shared_ptr", &self.shared_ptr)
// .field("hasher", &self.hasher)
.finish()
}
}
impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
/// Change the 'hasher' used by the hash table.
///
@@ -329,7 +298,7 @@ where
/// Get a reference to the entry containing a key.
///
/// NB: This takes a write lock as there's no way to distinguish whether the intention
/// NB: THis takes a write lock as there's no way to distinguish whether the intention
/// is to use the entry for reading or for writing in advance.
pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
let hash = self.get_hash_value(&key);

View File

@@ -1,6 +1,5 @@
//! Simple hash table with chaining.
use std::fmt::Debug;
use std::hash::Hash;
use std::mem::MaybeUninit;
@@ -18,19 +17,6 @@ pub(crate) struct Bucket<K, V> {
pub(crate) inner: Option<(K, V)>,
}
impl<K, V> Debug for Bucket<K, V>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Bucket")
.field("next", &self.next)
.field("inner", &self.inner)
.finish()
}
}
/// Core hash table implementation.
pub(crate) struct CoreHashMap<'a, K, V> {
/// Dictionary used to map hashes to bucket indices.
@@ -45,22 +31,6 @@ pub(crate) struct CoreHashMap<'a, K, V> {
pub(crate) buckets_in_use: u32,
}
impl<'a, K, V> Debug for CoreHashMap<'a, K, V>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("CoreHashMap")
.field("dictionary", &self.dictionary)
.field("buckets", &self.buckets)
.field("free_head", &self.free_head)
.field("alloc_limit", &self.alloc_limit)
.field("buckets_in_use", &self.buckets_in_use)
.finish()
}
}
/// Error for when there are no empty buckets left but one is needed.
#[derive(Debug, PartialEq)]
pub struct FullError;

View File

@@ -61,10 +61,6 @@ impl<K, V> OccupiedEntry<'_, '_, K, V> {
///
/// This may result in multiple bucket accesses if the entry was obtained by index as the
/// previous chain entry needs to be discovered in this case.
///
/// # Panics
/// Panics if the `prev_pos` field is equal to [`PrevPos::Unknown`]. In practice, this means
/// the entry was obtained via calling something like [`super::HashMapAccess::entry_at_bucket`].
pub fn remove(mut self) -> V {
// If this bucket was queried by index, go ahead and follow its chain from the start.
let prev = if let PrevPos::Unknown(hash) = self.prev_pos {

View File

@@ -21,7 +21,6 @@ use nix::unistd::ftruncate as nix_ftruncate;
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
/// future.
#[derive(Debug)]
pub struct ShmemHandle {
/// memfd file descriptor
fd: OwnedFd,
@@ -36,7 +35,6 @@ pub struct ShmemHandle {
}
/// This is stored at the beginning in the shared memory area.
#[derive(Debug)]
struct SharedStruct {
max_size: usize,

View File

@@ -310,11 +310,6 @@ impl AtomicLsn {
}
}
/// Consumes the atomic and returns the contained value.
pub const fn into_inner(self) -> Lsn {
Lsn(self.inner.into_inner())
}
/// Atomically retrieve the `Lsn` value from memory.
pub fn load(&self) -> Lsn {
Lsn(self.inner.load(Ordering::Acquire))

View File

@@ -341,34 +341,6 @@ extern "C-unwind" fn log_internal(
}
}
/* BEGIN_HADRON */
extern "C" fn reset_safekeeper_statuses_for_metrics(wp: *mut WalProposer, num_safekeepers: u32) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
if api.is_null() {
return;
}
(*api).reset_safekeeper_statuses_for_metrics(&mut (*wp), num_safekeepers);
}
}
extern "C" fn update_safekeeper_status_for_metrics(
wp: *mut WalProposer,
sk_index: u32,
status: u8,
) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
if api.is_null() {
return;
}
(*api).update_safekeeper_status_for_metrics(&mut (*wp), sk_index, status);
}
}
/* END_HADRON */
#[derive(Debug, PartialEq)]
pub enum Level {
Debug5,
@@ -442,10 +414,6 @@ pub(crate) fn create_api() -> walproposer_api {
finish_sync_safekeepers: Some(finish_sync_safekeepers),
process_safekeeper_feedback: Some(process_safekeeper_feedback),
log_internal: Some(log_internal),
/* BEGIN_HADRON */
reset_safekeeper_statuses_for_metrics: Some(reset_safekeeper_statuses_for_metrics),
update_safekeeper_status_for_metrics: Some(update_safekeeper_status_for_metrics),
/* END_HADRON */
}
}
@@ -483,8 +451,6 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
replica_promote: false,
min_ps_feedback: empty_feedback,
wal_rate_limiter: empty_wal_rate_limiter,
num_safekeepers: 0,
safekeeper_status: [0; 32],
}
}

View File

@@ -159,21 +159,6 @@ pub trait ApiImpl {
fn after_election(&self, _wp: &mut WalProposer) {
todo!()
}
/* BEGIN_HADRON */
fn reset_safekeeper_statuses_for_metrics(&self, _wp: &mut WalProposer, _num_safekeepers: u32) {
// Do nothing for testing purposes.
}
fn update_safekeeper_status_for_metrics(
&self,
_wp: &mut WalProposer,
_sk_index: u32,
_status: u8,
) {
// Do nothing for testing purposes.
}
/* END_HADRON */
}
#[derive(Debug)]

View File

@@ -54,7 +54,6 @@ pageserver_api.workspace = true
pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
pageserver_compaction.workspace = true
pageserver_page_api.workspace = true
peekable.workspace = true
pem.workspace = true
pin-project-lite.workspace = true
postgres_backend.workspace = true
@@ -67,7 +66,6 @@ postgres-types.workspace = true
posthog_client_lite.workspace = true
pprof.workspace = true
pq_proto.workspace = true
prost.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true

View File

@@ -3,4 +3,3 @@ mod pool;
mod retry;
pub use client::{PageserverClient, ShardSpec};
pub use pageserver_api::shard::ShardStripeSize; // used in ShardSpec

View File

@@ -33,8 +33,6 @@ pub enum ProtocolError {
Invalid(&'static str, String),
#[error("required field '{0}' is missing")]
Missing(&'static str),
#[error("invalid combination of not_modified_lsn '{0}' and request_lsn '{1}'")]
InvalidLsns(Lsn, Lsn),
}
impl ProtocolError {
@@ -87,9 +85,9 @@ impl TryFrom<proto::ReadLsn> for ReadLsn {
return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
}
if pb.not_modified_since_lsn > pb.request_lsn {
return Err(ProtocolError::InvalidLsns(
Lsn(pb.not_modified_since_lsn),
Lsn(pb.request_lsn),
return Err(ProtocolError::invalid(
"not_modified_since_lsn",
pb.not_modified_since_lsn,
));
}
Ok(Self {

View File

@@ -25,9 +25,6 @@ tracing.workspace = true
tokio.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
axum.workspace = true
http.workspace = true
metrics.workspace = true
tonic.workspace = true
url.workspace = true

View File

@@ -34,10 +34,6 @@ use crate::util::{request_stats, tokio_thread_local_stats};
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "false")]
grpc: bool,
#[clap(long, default_value = "false")]
grpc_stream: bool,
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
/// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
@@ -82,9 +78,6 @@ pub(crate) struct Args {
#[clap(long)]
set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
#[clap(long)]
only_relnode: Option<u32>,
/// Queue depth generated in each client.
#[clap(long, default_value = "1")]
queue_depth: NonZeroUsize,
@@ -99,31 +92,10 @@ pub(crate) struct Args {
#[clap(long, default_value = "1")]
batch_size: NonZeroUsize,
#[clap(long)]
only_relnode: Option<u32>,
targets: Option<Vec<TenantTimelineId>>,
#[clap(long, default_value = "100")]
pool_max_consumers: NonZeroUsize,
#[clap(long, default_value = "5")]
pool_error_threshold: NonZeroUsize,
#[clap(long, default_value = "5000")]
pool_connect_timeout: NonZeroUsize,
#[clap(long, default_value = "1000")]
pool_connect_backoff: NonZeroUsize,
#[clap(long, default_value = "60000")]
pool_max_idle_duration: NonZeroUsize,
#[clap(long, default_value = "0")]
max_delay_ms: usize,
#[clap(long, default_value = "0")]
percent_drops: usize,
#[clap(long, default_value = "0")]
percent_hangs: usize,
}
/// State shared by all clients
@@ -180,6 +152,7 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
main_impl(args, thread_local_stats)
})
}
async fn main_impl(
args: Args,
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
@@ -344,7 +317,6 @@ async fn main_impl(
let rps_period = args
.per_client_rate
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
let ss = shared_state.clone();
let cancel = cancel.clone();

View File

@@ -453,7 +453,6 @@ impl TimelineHandles {
handles: Default::default(),
}
}
async fn get(
&mut self,
tenant_id: TenantId,

View File

@@ -5,12 +5,10 @@ MODULE_big = neon
OBJS = \
$(WIN32RES) \
communicator.o \
communicator_new.o \
communicator_process.o \
extension_server.o \
file_cache.o \
hll.o \
lfc_prewarm.o \
libpagestore.o \
logical_replication_monitor.o \
neon.o \
@@ -69,7 +67,6 @@ WALPROP_OBJS = \
# libcommunicator.a is built by cargo from the Rust sources under communicator/
# subdirectory. `cargo build` also generates communicator_bindings.h.
communicator_new.o: communicator/communicator_bindings.h
communicator_process.o: communicator/communicator_bindings.h
file_cache.o: communicator/communicator_bindings.h

View File

@@ -17,31 +17,12 @@ rest_broker = []
[dependencies]
axum.workspace = true
bytes.workspace = true
clashmap.workspace = true
http.workspace = true
libc.workspace = true
nix.workspace = true
atomic_enum = "0.3.0"
measured.workspace = true
prometheus.workspace = true
prost.workspace = true
strum_macros.workspace = true
thiserror.workspace = true
tonic = { workspace = true, default-features = false, features=["codegen", "prost", "transport"] }
tokio = { workspace = true, features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
tokio-pipe = { version = "0.2.12" }
tracing.workspace = true
tracing-subscriber.workspace = true
metrics.workspace = true
uring-common = { workspace = true, features = ["bytes"] }
pageserver_client_grpc.workspace = true
pageserver_api.workspace = true
pageserver_page_api.workspace = true
neon-shmem.workspace = true
measured.workspace = true
utils.workspace = true
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }

View File

@@ -3,18 +3,9 @@
This package provides the so-called "compute-pageserver communicator",
or just "communicator" in short. The communicator is a separate
background worker process that runs in the PostgreSQL server. It's
part of the neon extension.
The commuicator handles the communication with the pageservers, and
also provides an HTTP endpoint for metrics over a local Unix Domain
socket (aka. the "communicator control socket"). On the PostgreSQL
side, the glue code in pgxn/neon/ uses the communicator to implement
the PostgreSQL Storage Manager (SMGR) interface.
## Design criteria
- Low latency
- Saturate a 10 Gbit / s network interface without becoming a bottleneck
part of the neon extension. Currently, it only provides an HTTP
endpoint for metrics, but in the future it will evolve to handle all
communications with the pageservers.
## Source code view
@@ -23,122 +14,10 @@ pgxn/neon/communicator_process.c
the glue that interacts with PostgreSQL code and the Rust
code in the communicator process.
pgxn/neon/communicator_new.c
Contains the backend code that interacts with the communicator
process.
pgxn/neon/communicator/src/backend_interface.rs
The entry point for calls from each backend.
pgxn/neon/communicator/src/init.rs
Initialization at server startup
pgxn/neon/communicator/src/worker_process/
Worker process main loop and glue code
At compilation time, pgxn/neon/communicator/ produces a static
library, libcommunicator.a. It is linked to the neon.so extension
library.
The real networking code, which is independent of PostgreSQL, is in
the pageserver/client_grpc crate.
## Process view
The communicator runs in a dedicated background worker process, the
"communicator process". The communicator uses a multi-threaded Tokio
runtime to execute the IO requests. So the communicator process has
multiple threads running. That's unusual for Postgres processes and
care must be taken to make that work.
### Backend <-> worker communication
Each backend has a number of I/O request slots in shared memory. The
slots are statically allocated for each backend, and must not be
accessed by other backends. The worker process reads requests from the
shared memory slots, and writes responses back to the slots.
Here's an example snapshot of the system, when two requests from two
different backends are in progress:
```
Backends Request slots Communicator process
--------- ------------- --------------------
Backend 1 1: Idle
2: Idle
3: Processing tokio task handling request 3
Backend 2 4: Completed
5: Processing tokio task handling request 5
6: Idle
... ...
```
To submit an IO request, the backend first picks one of its Idle
slots, writes the IO request in the slot, and updates it to
'Submitted' state. That transfers the ownership of the slot to the
worker process, until the worker process marks the request as
Completed. The worker process spawns a separate Tokio task for each
request.
To inform the worker process that a request slot has a pending IO
request, there's a pipe shared by the worker process and all backend
processes. The backend writes the index of the request slot to the
pipe after changing the slot's state to Submitted. This wakes up the
worker process.
(Note that the pipe is just used for wakeups, but the worker process
is free to pick up Submitted IO requests even without receiving the
wakeup. As of this writing, it doesn't do that, but it might be useful
in the future to reduce latency even further, for example.)
When the worker process has completed processing the request, it
writes the result back in the request slot. A GetPage request can also
contain a pointer to buffer in the shared buffer cache. In that case,
the worker process writes the resulting page contents directly to the
buffer, and just a result code in the request slot. It then updates
the 'state' field to Completed, which passes the owner ship back to
the originating backend. Finally, it signals the process Latch of the
originating backend, waking it up.
### Differences between PostgreSQL v16, v17 and v18
PostgreSQL v18 introduced the new AIO mechanism. The PostgreSQL AIO
mechanism uses a very similar mechanism as described in the previous
section, for the communication between AIO worker processes and
backends. With our communicator, the AIO worker processes are not
used, but we use the same PgAioHandle request slots as in upstream.
For Neon-specific IO requests like GetDbSize, a neon request slot is
used. But for the actual IO requests, the request slot merely contains
a pointer to the PgAioHandle slot. The worker process updates the
status of that, calls the IO callbacks upon completionetc, just like
the upstream AIO worker processes do.
## Sequence diagram
neon
PostgreSQL extension backend_interface.rs worker_process.rs processor tonic
| . . . .
| smgr_read() . . . .
+-------------> + . . .
. | . . .
. | rcommunicator_ . . .
. | get_page_at_lsn . . .
. +------------------> + . .
| . .
| write request to . . .
| slot . .
| . .
| . .
| submit_request() . .
+-----------------> + .
| | .
| | db_size_request . .
+---------------->.
. TODO
### Compute <-> pageserver protocol
The protocol between Compute and the pageserver is based on gRPC. See `protos/`.

View File

@@ -1,224 +0,0 @@
//! This module implements a request/response "slot" for submitting
//! requests from backends to the communicator process.
//!
//! NB: The "backend" side of this code runs in Postgres backend processes,
//! which means that it is not safe to use the 'tracing' crate for logging, nor
//! to launch threads or use tokio tasks!
use std::cell::UnsafeCell;
use std::sync::atomic::{AtomicI32, Ordering};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use atomic_enum::atomic_enum;
/// One request/response slot. Each backend has its own set of slots that it
/// uses.
///
/// This is the moral equivalent of PgAioHandle for Postgres AIO requests
/// Like PgAioHandle, try to keep this small.
///
/// There is an array of these in shared memory. Therefore, this must be Sized.
///
/// ## Lifecycle of a request
///
/// A slot is always owned by either the backend process or the communicator
/// process, depending on the 'state'. Only the owning process is allowed to
/// read or modify the slot, except for reading the 'state' itself to check who
/// owns it.
///
/// A slot begins in the Idle state, where it is owned by the backend process.
/// To submit a request, the backend process fills the slot with the request
/// data, and changes it to the Submitted state. After changing the state, the
/// slot is owned by the communicator process, and the backend is not allowed
/// to access it until the communicator process marks it as Completed.
///
/// When the communicator process sees that the slot is in Submitted state, it
/// starts to process the request. After processing the request, it stores the
/// result in the slot, and changes the state to Completed. It is now owned by
/// the backend process again, which may now read the result, and reuse the
/// slot for a new request.
///
/// For correctness of the above protocol, we really only need two states:
/// "owned by backend" and "owned by communicator process". But to help with
/// debugging and better assertions, there are a few more states. When the
/// backend starts to fill in the request details in the slot, it first sets the
/// state from Idle to Filling, and when it's done with that, from Filling to
/// Submitted. In the Filling state, the slot is still owned by the
/// backend. Similarly, when the communicator process starts to process a
/// request, it sets it to Processing state first, but the slot is still owned
/// by the communicator process.
///
/// This struct doesn't handle waking up the communicator process when a request
/// has been submitted or when a response is ready. The 'owner_procno' is used
/// for waking up the backend on completion, but that happens elsewhere.
pub struct NeonIORequestSlot {
/// similar to PgAioHandleState
state: AtomicNeonIORequestSlotState,
/// The owning process's ProcNumber. The worker process uses this to set the
/// process's latch on completion.
///
/// (This could be calculated from num_neon_request_slots_per_backend and
/// the index of this slot in the overall 'neon_requst_slots array'. But we
/// prefer the communicator process to not know how the request slots are
/// divided between the backends.)
owner_procno: AtomicI32,
/// SAFETY: This is modified by submit_request(), after it has established
/// ownership of the slot by setting state from Idle to Filling
request: UnsafeCell<NeonIORequest>,
/// Valid when state is Completed
///
/// SAFETY: This is modified by RequestProcessingGuard::complete(). There
/// can be only one RequestProcessingGuard outstanding for a slot at a time,
/// because it is returned by start_processing_request() which checks the
/// state, so RequestProcessingGuard has exclusive access to the slot.
result: UnsafeCell<NeonIOResult>,
}
// The protocol described in the "Lifecycle of a request" section above ensures
// the safe access to the fields
unsafe impl Send for NeonIORequestSlot {}
unsafe impl Sync for NeonIORequestSlot {}
impl Default for NeonIORequestSlot {
fn default() -> NeonIORequestSlot {
NeonIORequestSlot {
owner_procno: AtomicI32::new(-1),
request: UnsafeCell::new(NeonIORequest::Empty),
result: UnsafeCell::new(NeonIOResult::Empty),
state: AtomicNeonIORequestSlotState::new(NeonIORequestSlotState::Idle),
}
}
}
#[atomic_enum]
#[derive(Eq, PartialEq)]
pub enum NeonIORequestSlotState {
Idle,
/// Backend is filling in the request
Filling,
/// Backend has submitted the request to the communicator, but the
/// communicator process has not yet started processing it.
Submitted,
/// Communicator is processing the request
Processing,
/// Communicator has completed the request, and the 'result' field is now
/// valid, but the backend has not read the result yet.
Completed,
}
impl NeonIORequestSlot {
/// Write a request to the slot, and mark it as Submitted.
///
/// Note: This does not wake up the worker process to actually process
/// the request. It's the caller's responsibility to do that.
pub fn submit_request(&self, request: &NeonIORequest, proc_number: i32) {
// Verify that the slot is in Idle state previously, and put it in
// Filling state.
//
// XXX: This step isn't strictly necessary. Assuming the caller didn't
// screw up and try to use a slot that's already in use, we could fill
// the slot and switch it directly from Idle to Submitted state.
if let Err(s) = self.state.compare_exchange(
NeonIORequestSlotState::Idle,
NeonIORequestSlotState::Filling,
Ordering::Relaxed,
Ordering::Relaxed,
) {
panic!("unexpected state in request slot: {s:?}");
}
// Fill in the request details
self.owner_procno.store(proc_number, Ordering::Relaxed);
unsafe { *self.request.get() = *request }
// This synchronizes-with store/swap in [`start_processing_request`].
// Note that this ensures that the previous non-atomic writes visible
// to other threads too.
self.state
.store(NeonIORequestSlotState::Submitted, Ordering::Release);
}
pub fn get_state(&self) -> NeonIORequestSlotState {
self.state.load(Ordering::Relaxed)
}
pub fn try_get_result(&self) -> Option<NeonIOResult> {
// This synchronizes-with the store/swap in [`RequestProcessingGuard::completed`]
let state = self.state.load(Ordering::Acquire);
if state == NeonIORequestSlotState::Completed {
let result = unsafe { *self.result.get() };
self.state
.store(NeonIORequestSlotState::Idle, Ordering::Relaxed);
Some(result)
} else {
None
}
}
/// Read the IO request from the slot indicated in the wakeup
pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
// XXX: using atomic load rather than compare_exchange would be
// sufficient here, as long as the communicator process has _some_ means
// of tracking which requests it's already processing. That could be a
// flag somewhere in communicator's private memory, for example.
//
// This synchronizes-with the store in [`submit_request`].
if let Err(s) = self.state.compare_exchange(
NeonIORequestSlotState::Submitted,
NeonIORequestSlotState::Processing,
Ordering::Acquire,
Ordering::Relaxed,
) {
// FIXME surprising state. This is unexpected at the moment, but if we
// started to process requests more aggressively, without waiting for the
// read from the pipe, then this could happen
panic!("unexpected state in request slot: {s:?}");
}
Some(RequestProcessingGuard(self))
}
}
/// [`NeonIORequestSlot::start_processing_request`] returns this guard object to
/// indicate that the the caller now "owns" the slot, until it calls
/// [`RequestProcessingGuard::completed`].
///
/// TODO: implement Drop on this, to mark the request as Aborted or Errored
/// if [`RequestProcessingGuard::completed`] is not called.
pub struct RequestProcessingGuard<'a>(&'a NeonIORequestSlot);
unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
impl<'a> RequestProcessingGuard<'a> {
pub fn get_request(&self) -> &NeonIORequest {
unsafe { &*self.0.request.get() }
}
pub fn get_owner_procno(&self) -> i32 {
self.0.owner_procno.load(Ordering::Relaxed)
}
pub fn completed(self, result: NeonIOResult) {
// Store the result to the slot.
unsafe {
*self.0.result.get() = result;
};
// Mark the request as completed. After that, we no longer have
// ownership of the slot, and must not modify it.
let old_state = self
.0
.state
.swap(NeonIORequestSlotState::Completed, Ordering::Release);
assert!(old_state == NeonIORequestSlotState::Processing);
}
}

View File

@@ -1,296 +0,0 @@
//! This code runs in each backend process. That means that launching Rust threads, panicking
//! etc. is forbidden!
use std::os::fd::OwnedFd;
use crate::backend_comms::NeonIORequestSlot;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
use crate::neon_request::{CCachedGetPageVResult, CLsn, COid};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use utils::lsn::Lsn;
pub struct CommunicatorBackendStruct<'t> {
my_proc_number: i32,
neon_request_slots: &'t [NeonIORequestSlot],
submission_pipe_write_fd: OwnedFd,
pending_cache_read_op: Option<BackendCacheReadOp<'t>>,
integrated_cache: &'t IntegratedCacheReadAccess<'t>,
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_backend_init(
cis: Box<CommunicatorInitStruct>,
my_proc_number: i32,
) -> &'static mut CommunicatorBackendStruct<'static> {
if my_proc_number < 0 {
panic!("cannot attach to communicator shared memory with procnumber {my_proc_number}");
}
let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init()));
let bs: &'static mut CommunicatorBackendStruct =
Box::leak(Box::new(CommunicatorBackendStruct {
my_proc_number,
neon_request_slots: cis.neon_request_slots,
submission_pipe_write_fd: cis.submission_pipe_write_fd,
pending_cache_read_op: None,
integrated_cache,
}));
bs
}
/// Start a request. You can poll for its completion and get the result by
/// calling bcomm_poll_dbsize_request_completion(). The communicator will wake
/// us up by setting our process latch, so to wait for the completion, wait on
/// the latch and call bcomm_poll_dbsize_request_completion() every time the
/// latch is set.
///
/// Safety: The C caller must ensure that the references are valid.
/// The requested slot must be free, or this panics.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_io_request(
bs: &'_ mut CommunicatorBackendStruct,
slot_idx: i32,
request: &NeonIORequest,
immediate_result_ptr: &mut NeonIOResult,
) -> i32 {
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
if let NeonIORequest::RelSize(req) = request {
if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) {
*immediate_result_ptr = NeonIOResult::RelSize(nblocks);
return -1;
}
}
// Create neon request and submit it
bs.start_neon_io_request(slot_idx, request);
slot_idx
}
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_get_page_v_request(
bs: &mut CommunicatorBackendStruct,
slot_idx: i32,
request: &NeonIORequest,
immediate_result_ptr: &mut CCachedGetPageVResult,
) -> i32 {
let NeonIORequest::GetPageV(get_pagev_request) = request else {
panic!("invalid request passed to bcomm_start_get_page_v_request()");
};
assert!(matches!(request, NeonIORequest::GetPageV(_)));
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
let mut all_cached = true;
let mut read_op = bs.integrated_cache.start_read_op();
for i in 0..get_pagev_request.nblocks {
if let Some(cache_block) = read_op.get_page(
&get_pagev_request.reltag(),
get_pagev_request.block_number + i as u32,
) {
immediate_result_ptr.cache_block_numbers[i as usize] = cache_block;
} else {
// not found in cache
all_cached = false;
break;
}
}
if all_cached {
bs.pending_cache_read_op = Some(read_op);
return -1;
}
// Create neon request and submit it
bs.start_neon_io_request(slot_idx, request);
slot_idx
}
/// Check if a request has completed. Returns:
///
/// -1 if the request is still being processed
/// 0 on success
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_poll_request_completion(
bs: &mut CommunicatorBackendStruct,
request_slot_idx: u32,
result_p: &mut NeonIOResult,
) -> i32 {
match bs.neon_request_slots[request_slot_idx as usize].try_get_result() {
None => -1, // still processing
Some(result) => {
*result_p = result;
0
}
}
}
/// Check if a request has completed. Returns:
///
/// 'false' if the slot is Idle. The backend process has ownership.
/// 'true' if the slot is busy, and should be polled for result.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_get_request_slot_status(
bs: &mut CommunicatorBackendStruct,
request_slot_idx: u32,
) -> bool {
use crate::backend_comms::NeonIORequestSlotState;
match bs.neon_request_slots[request_slot_idx as usize].get_state() {
NeonIORequestSlotState::Idle => false,
NeonIORequestSlotState::Filling => {
// 'false' would be the right result here. However, this
// is a very transient state. The C code should never
// leave a slot in this state, so if it sees that,
// something's gone wrong and it's not clear what to do
// with it.
panic!("unexpected Filling state in request slot {request_slot_idx}");
}
NeonIORequestSlotState::Submitted => true,
NeonIORequestSlotState::Processing => true,
NeonIORequestSlotState::Completed => true,
}
}
// LFC functions
/// Finish a local file cache read
///
//
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool {
if let Some(op) = bs.pending_cache_read_op.take() {
op.finish()
} else {
panic!("bcomm_finish_cache_read() called with no cached read pending");
}
}
/// Check if LFC contains the given buffer, and update its last-written LSN if not.
///
/// This is used in WAL replay in read replica, to skip updating pages that are
/// not in cache.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_update_lw_lsn_for_block_if_not_cached(
bs: &mut CommunicatorBackendStruct,
spc_oid: COid,
db_oid: COid,
rel_number: u32,
fork_number: u8,
block_number: u32,
lsn: CLsn,
) -> bool {
bs.integrated_cache.update_lw_lsn_for_block_if_not_cached(
&pageserver_page_api::RelTag {
spcnode: spc_oid,
dbnode: db_oid,
relnode: rel_number,
forknum: fork_number,
},
block_number,
Lsn(lsn),
)
}
#[repr(C)]
#[derive(Clone, Debug)]
pub struct FileCacheIterator {
next_bucket: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
}
/// Iterate over LFC contents
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_iterate_begin(
_bs: &mut CommunicatorBackendStruct,
iter: *mut FileCacheIterator,
) {
unsafe { (*iter).next_bucket = 0 };
}
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_iterate_next(
bs: &mut CommunicatorBackendStruct,
iter: *mut FileCacheIterator,
) -> bool {
use crate::integrated_cache::GetBucketResult;
loop {
let next_bucket = unsafe { (*iter).next_bucket } as usize;
match bs.integrated_cache.get_bucket(next_bucket) {
GetBucketResult::Occupied(rel, blk) => {
unsafe {
(*iter).spc_oid = rel.spcnode;
(*iter).db_oid = rel.dbnode;
(*iter).rel_number = rel.relnode;
(*iter).fork_number = rel.forknum;
(*iter).block_number = blk;
(*iter).next_bucket += 1;
}
break true;
}
GetBucketResult::Vacant => {
unsafe {
(*iter).next_bucket += 1;
}
continue;
}
GetBucketResult::OutOfBounds => {
break false;
}
}
}
}
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_get_num_pages_used(bs: &mut CommunicatorBackendStruct) -> u64 {
bs.integrated_cache.get_num_buckets_in_use() as u64
}
impl<'t> CommunicatorBackendStruct<'t> {
/// The slot must be free, or this panics.
pub(crate) fn start_neon_io_request(&mut self, request_slot_idx: i32, request: &NeonIORequest) {
let my_proc_number = self.my_proc_number;
self.neon_request_slots[request_slot_idx as usize].submit_request(request, my_proc_number);
// Tell the communicator about it
self.notify_about_request(request_slot_idx);
}
/// Send a wakeup to the communicator process
fn notify_about_request(self: &CommunicatorBackendStruct<'t>, request_slot_idx: i32) {
// wake up communicator by writing the idx to the submission pipe
//
// This can block, if the pipe is full. That should be very rare,
// because the communicator tries hard to drain the pipe to prevent
// that. Also, there's a natural upper bound on how many wakeups can be
// queued up: there is only a limited number of request slots for each
// backend.
//
// If it does block very briefly, that's not too serious.
let idxbuf = request_slot_idx.to_ne_bytes();
let _res = nix::unistd::write(&self.submission_pipe_write_fd, &idxbuf);
// FIXME: check result, return any errors
}
}

View File

@@ -1,162 +0,0 @@
//! Implement the "low-level" parts of the file cache.
//!
//! This module just deals with reading and writing the file, and keeping track
//! which blocks in the cache file are in use and which are free. The "high
//! level" parts of tracking which block in the cache file corresponds to which
//! relation block is handled in 'integrated_cache' instead.
//!
//! This module is only used to access the file from the communicator
//! process. The backend processes *also* read the file (and sometimes also
//! write it? ), but the backends use direct C library calls for that.
use std::fs::File;
use std::os::unix::fs::FileExt;
use std::path::Path;
use std::sync::Arc;
use std::sync::Mutex;
use crate::BLCKSZ;
use tokio::task::spawn_blocking;
pub type CacheBlock = u64;
pub const INVALID_CACHE_BLOCK: CacheBlock = u64::MAX;
#[derive(Debug)]
pub struct FileCache {
file: Arc<File>,
free_list: Mutex<FreeList>,
// metrics
max_blocks_gauge: metrics::IntGauge,
num_free_blocks_gauge: metrics::IntGauge,
}
// TODO: We keep track of all free blocks in this vec. That doesn't really scale.
// Idea: when free_blocks fills up with more than 1024 entries, write them all to
// one block on disk.
#[derive(Debug)]
struct FreeList {
next_free_block: CacheBlock,
max_blocks: u64,
free_blocks: Vec<CacheBlock>,
}
impl FileCache {
pub fn new(file_cache_path: &Path, mut initial_size: u64) -> Result<FileCache, std::io::Error> {
if initial_size < 100 {
tracing::warn!(
"min size for file cache is 100 blocks, {} requested",
initial_size
);
initial_size = 100;
}
let file = std::fs::OpenOptions::new()
.read(true)
.write(true)
.truncate(true)
.create(true)
.open(file_cache_path)?;
let max_blocks_gauge = metrics::IntGauge::new(
"file_cache_max_blocks",
"Local File Cache size in 8KiB blocks",
)
.unwrap();
let num_free_blocks_gauge = metrics::IntGauge::new(
"file_cache_num_free_blocks",
"Number of free 8KiB blocks in Local File Cache",
)
.unwrap();
tracing::info!("initialized file cache with {} blocks", initial_size);
Ok(FileCache {
file: Arc::new(file),
free_list: Mutex::new(FreeList {
next_free_block: 0,
max_blocks: initial_size,
free_blocks: Vec::new(),
}),
max_blocks_gauge,
num_free_blocks_gauge,
})
}
// File cache management
pub async fn read_block(
&self,
cache_block: CacheBlock,
mut dst: impl uring_common::buf::IoBufMut + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(dst.bytes_total() == BLCKSZ);
let file = self.file.clone();
let dst_ref = unsafe { std::slice::from_raw_parts_mut(dst.stable_mut_ptr(), BLCKSZ) };
spawn_blocking(move || file.read_exact_at(dst_ref, cache_block * BLCKSZ as u64)).await??;
Ok(())
}
pub async fn write_block(
&self,
cache_block: CacheBlock,
src: impl uring_common::buf::IoBuf + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(src.bytes_init() == BLCKSZ);
let file = self.file.clone();
let src_ref = unsafe { std::slice::from_raw_parts(src.stable_ptr(), BLCKSZ) };
spawn_blocking(move || file.write_all_at(src_ref, cache_block * BLCKSZ as u64)).await??;
Ok(())
}
pub fn alloc_block(&self) -> Option<CacheBlock> {
let mut free_list = self.free_list.lock().unwrap();
if let Some(x) = free_list.free_blocks.pop() {
return Some(x);
}
if free_list.next_free_block < free_list.max_blocks {
let result = free_list.next_free_block;
free_list.next_free_block += 1;
return Some(result);
}
None
}
pub fn dealloc_block(&self, cache_block: CacheBlock) {
let mut free_list = self.free_list.lock().unwrap();
free_list.free_blocks.push(cache_block);
}
}
impl metrics::core::Collector for FileCache {
fn desc(&self) -> Vec<&metrics::core::Desc> {
let mut descs = Vec::new();
descs.append(&mut self.max_blocks_gauge.desc());
descs.append(&mut self.num_free_blocks_gauge.desc());
descs
}
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
// Update the gauges with fresh values first
{
let free_list = self.free_list.lock().unwrap();
self.max_blocks_gauge.set(free_list.max_blocks as i64);
let total_free_blocks: i64 = free_list.free_blocks.len() as i64
+ (free_list.max_blocks as i64 - free_list.next_free_block as i64);
self.num_free_blocks_gauge.set(total_free_blocks);
}
let mut values = Vec::new();
values.append(&mut self.max_blocks_gauge.collect());
values.append(&mut self.num_free_blocks_gauge.collect());
values
}
}

View File

@@ -1,109 +0,0 @@
//! Global allocator, for tracking memory usage of the Rust parts
//!
//! Postgres is designed to handle allocation failure (ie. malloc() returning NULL) gracefully. It
//! rolls backs the transaction and gives the user an "ERROR: out of memory" error. Rust code
//! however panics if an allocation fails. We don't want that to ever happen, because an unhandled
//! panic leads to Postgres crash and restart. Our strategy is to pre-allocate a large enough chunk
//! of memory for use by the Rust code, so that the allocations never fail.
//!
//! To pick the size for the pre-allocated chunk, we have a metric to track the high watermark
//! memory usage of all the Rust allocations in total.
//!
//! TODO:
//!
//! - Currently we just export the metrics. Actual allocations are still just passed through to
//! the system allocator.
//! - Take padding etc. overhead into account
use std::alloc::{GlobalAlloc, Layout, System};
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use metrics::IntGauge;
struct MyAllocator {
allocations: AtomicU64,
deallocations: AtomicU64,
allocated: AtomicUsize,
high: AtomicUsize,
}
unsafe impl GlobalAlloc for MyAllocator {
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
self.allocations.fetch_add(1, Ordering::Relaxed);
let mut allocated = self.allocated.fetch_add(layout.size(), Ordering::Relaxed);
allocated += layout.size();
self.high.fetch_max(allocated, Ordering::Relaxed);
unsafe { System.alloc(layout) }
}
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
self.deallocations.fetch_add(1, Ordering::Relaxed);
self.allocated.fetch_sub(layout.size(), Ordering::Relaxed);
unsafe { System.dealloc(ptr, layout) }
}
}
#[global_allocator]
static GLOBAL: MyAllocator = MyAllocator {
allocations: AtomicU64::new(0),
deallocations: AtomicU64::new(0),
allocated: AtomicUsize::new(0),
high: AtomicUsize::new(0),
};
pub struct MyAllocatorCollector {
allocations: IntGauge,
deallocations: IntGauge,
allocated: IntGauge,
high: IntGauge,
}
impl MyAllocatorCollector {
pub fn new() -> MyAllocatorCollector {
MyAllocatorCollector {
allocations: IntGauge::new("allocations_total", "Number of allocations in Rust code")
.unwrap(),
deallocations: IntGauge::new(
"deallocations_total",
"Number of deallocations in Rust code",
)
.unwrap(),
allocated: IntGauge::new("allocated_total", "Bytes currently allocated").unwrap(),
high: IntGauge::new("allocated_high", "High watermark of allocated bytes").unwrap(),
}
}
}
impl metrics::core::Collector for MyAllocatorCollector {
fn desc(&self) -> Vec<&metrics::core::Desc> {
let mut descs = Vec::new();
descs.append(&mut self.allocations.desc());
descs.append(&mut self.deallocations.desc());
descs.append(&mut self.allocated.desc());
descs.append(&mut self.high.desc());
descs
}
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
let mut values = Vec::new();
// update the gauges
self.allocations
.set(GLOBAL.allocations.load(Ordering::Relaxed) as i64);
self.deallocations
.set(GLOBAL.allocations.load(Ordering::Relaxed) as i64);
self.allocated
.set(GLOBAL.allocated.load(Ordering::Relaxed) as i64);
self.high.set(GLOBAL.high.load(Ordering::Relaxed) as i64);
values.append(&mut self.allocations.collect());
values.append(&mut self.deallocations.collect());
values.append(&mut self.allocated.collect());
values.append(&mut self.high.collect());
values
}
}

View File

@@ -1,166 +0,0 @@
//! Initialization functions. These are executed in the postmaster process,
//! at different stages of server startup.
//!
//!
//! Communicator initialization steps:
//!
//! 1. At postmaster startup, before shared memory is allocated,
//! rcommunicator_shmem_size() is called to get the amount of
//! shared memory that this module needs.
//!
//! 2. Later, after the shared memory has been allocated,
//! rcommunicator_shmem_init() is called to initialize the shmem
//! area.
//!
//! Per process initialization:
//!
//! When a backend process starts up, it calls rcommunicator_backend_init().
//! In the communicator worker process, other functions are called, see
//! `worker_process` module.
use std::ffi::c_int;
use std::mem;
use std::mem::MaybeUninit;
use std::os::fd::OwnedFd;
use crate::backend_comms::NeonIORequestSlot;
use crate::integrated_cache::IntegratedCacheInitStruct;
/// This struct is created in the postmaster process, and inherited to
/// the communicator process and all backend processes through fork()
#[repr(C)]
pub struct CommunicatorInitStruct {
pub submission_pipe_read_fd: OwnedFd,
pub submission_pipe_write_fd: OwnedFd,
// Shared memory data structures
pub num_neon_request_slots: u32,
pub neon_request_slots: &'static [NeonIORequestSlot],
pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>,
}
impl std::fmt::Debug for CommunicatorInitStruct {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
fmt.debug_struct("CommunicatorInitStruct")
.field("submission_pipe_read_fd", &self.submission_pipe_read_fd)
.field("submission_pipe_write_fd", &self.submission_pipe_write_fd)
.field("num_neon_request_slots", &self.num_neon_request_slots)
.field("neon_request_slots length", &self.neon_request_slots.len())
.finish()
}
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_size(num_neon_request_slots: u32) -> u64 {
let mut size = 0;
size += mem::size_of::<NeonIORequestSlot>() * num_neon_request_slots as usize;
// For integrated_cache's Allocator. TODO: make this adjustable
size += IntegratedCacheInitStruct::shmem_size();
size as u64
}
/// Initialize the shared memory segment. Returns a backend-private
/// struct, which will be inherited by backend processes through fork
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_init(
submission_pipe_read_fd: c_int,
submission_pipe_write_fd: c_int,
num_neon_request_slots: u32,
shmem_area_ptr: *mut MaybeUninit<u8>,
shmem_area_len: u64,
initial_file_cache_size: u64,
max_file_cache_size: u64,
) -> &'static mut CommunicatorInitStruct {
let shmem_area: &'static mut [MaybeUninit<u8>] =
unsafe { std::slice::from_raw_parts_mut(shmem_area_ptr, shmem_area_len as usize) };
let (neon_request_slots, remaining_area) =
alloc_array_from_slice::<NeonIORequestSlot>(shmem_area, num_neon_request_slots as usize);
for slot in neon_request_slots.iter_mut() {
slot.write(NeonIORequestSlot::default());
}
// 'neon_request_slots' is initialized now. (MaybeUninit::slice_assume_init_mut() is nightly-only
// as of this writing.)
let neon_request_slots = unsafe {
std::mem::transmute::<&mut [MaybeUninit<NeonIORequestSlot>], &mut [NeonIORequestSlot]>(
neon_request_slots,
)
};
// Give the rest of the area to the integrated cache
let integrated_cache_init_struct = IntegratedCacheInitStruct::shmem_init(
remaining_area,
initial_file_cache_size,
max_file_cache_size,
);
let (submission_pipe_read_fd, submission_pipe_write_fd) = unsafe {
use std::os::fd::FromRawFd;
(
OwnedFd::from_raw_fd(submission_pipe_read_fd),
OwnedFd::from_raw_fd(submission_pipe_write_fd),
)
};
let cis: &'static mut CommunicatorInitStruct = Box::leak(Box::new(CommunicatorInitStruct {
submission_pipe_read_fd,
submission_pipe_write_fd,
num_neon_request_slots,
neon_request_slots,
integrated_cache_init_struct,
}));
cis
}
pub fn alloc_from_slice<T>(
area: &mut [MaybeUninit<u8>],
) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
let layout = std::alloc::Layout::new::<T>();
let area_start = area.as_mut_ptr();
// pad to satisfy alignment requirements
let padding = area_start.align_offset(layout.align());
if padding + layout.size() > area.len() {
panic!("out of memory");
}
let area = &mut area[padding..];
let (result_area, remain) = area.split_at_mut(layout.size());
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
let result = unsafe { result_ptr.as_mut().unwrap() };
(result, remain)
}
pub fn alloc_array_from_slice<T>(
area: &mut [MaybeUninit<u8>],
len: usize,
) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
let layout = std::alloc::Layout::new::<T>();
let area_start = area.as_mut_ptr();
// pad to satisfy alignment requirements
let padding = area_start.align_offset(layout.align());
if padding + layout.size() * len > area.len() {
panic!("out of memory");
}
let area = &mut area[padding..];
let (result_area, remain) = area.split_at_mut(layout.size() * len);
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
let result = unsafe { std::slice::from_raw_parts_mut(result_ptr.as_mut().unwrap(), len) };
(result, remain)
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,29 +1,5 @@
//! Three main parts:
//! - async tokio communicator core, which receives requests and processes them.
//! - Main loop and requests queues, which routes requests from backends to the core
//! - the per-backend glue code, which submits requests
mod backend_comms;
// mark this 'pub', because these functions are called from C code. Otherwise, the compiler
// complains about a bunch of structs and enum variants being unused, because it thinkgs
// the functions that use them are never called. There are some C-callable functions in
// other modules too, but marking this as pub is currently enough to silence the warnings
//
// TODO: perhaps collect *all* the extern "C" functions to one module?
pub mod backend_interface;
mod file_cache;
mod init;
mod integrated_cache;
mod neon_request;
mod worker_process;
mod global_allocator;
/// Name of the Unix Domain Socket that serves the metrics, and other APIs in the
/// future. This is within the Postgres data directory.
const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket";
// FIXME: get this from postgres headers somehow
pub const BLCKSZ: usize = 8192;

View File

@@ -1,466 +0,0 @@
// Definitions of some core PostgreSQL datatypes.
/// XLogRecPtr is defined in "access/xlogdefs.h" as:
///
/// ```
/// typedef uint64 XLogRecPtr;
/// ```
/// cbindgen:no-export
pub type XLogRecPtr = u64;
pub type CLsn = XLogRecPtr;
pub type COid = u32;
// This conveniently matches PG_IOV_MAX
pub const MAX_GETPAGEV_PAGES: usize = 32;
pub const INVALID_BLOCK_NUMBER: u32 = u32::MAX;
use std::ffi::CStr;
use pageserver_page_api::{self as page_api, SlruKind};
/// Request from a Postgres backend to the communicator process
#[allow(clippy::large_enum_variant)]
#[repr(C)]
#[derive(Copy, Clone, Debug, strum_macros::EnumDiscriminants)]
#[strum_discriminants(derive(measured::FixedCardinalityLabel))]
pub enum NeonIORequest {
Empty,
// Read requests. These are C-friendly variants of the corresponding structs in
// pageserver_page_api.
RelSize(CRelSizeRequest),
GetPageV(CGetPageVRequest),
ReadSlruSegment(CReadSlruSegmentRequest),
PrefetchV(CPrefetchVRequest),
DbSize(CDbSizeRequest),
/// This is like GetPageV, but bypasses the LFC and allows specifiying the
/// request LSNs directly. For debugging purposes only.
GetPageVUncached(CGetPageVUncachedRequest),
// Write requests. These are needed to keep the relation size cache and LFC up-to-date.
// They are not sent to the pageserver.
WritePage(CWritePageRequest),
RelExtend(CRelExtendRequest),
RelZeroExtend(CRelZeroExtendRequest),
RelCreate(CRelCreateRequest),
RelTruncate(CRelTruncateRequest),
RelUnlink(CRelUnlinkRequest),
// Other requests
UpdateCachedRelSize(CUpdateCachedRelSizeRequest),
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub enum NeonIOResult {
Empty,
/// InvalidBlockNumber == 0xffffffff means "rel does not exist"
RelSize(u32),
/// the result pages are written to the shared memory addresses given in the request
GetPageV,
/// The result is written to the file, path to which is provided
/// in the request. The [`u64`] value here is the number of blocks.
ReadSlruSegment(u64),
/// A prefetch request returns as soon as the request has been received by the communicator.
/// It is processed in the background.
PrefetchVLaunched,
DbSize(u64),
// FIXME design compact error codes. Can't easily pass a string or other dynamic data.
// currently, this is 'errno'
Error(i32),
Aborted,
/// used for all write requests
WriteOK,
}
impl NeonIORequest {
/// All requests include a unique request ID, which can be used to trace the execution
/// of a request all the way to the pageservers. The request ID needs to be unique
/// within the lifetime of the Postgres instance (but not across servers or across
/// restarts of the same server).
pub fn request_id(&self) -> u64 {
use NeonIORequest::*;
match self {
Empty => 0,
RelSize(req) => req.request_id,
GetPageV(req) => req.request_id,
GetPageVUncached(req) => req.request_id,
ReadSlruSegment(req) => req.request_id,
PrefetchV(req) => req.request_id,
DbSize(req) => req.request_id,
WritePage(req) => req.request_id,
RelExtend(req) => req.request_id,
RelZeroExtend(req) => req.request_id,
RelCreate(req) => req.request_id,
RelTruncate(req) => req.request_id,
RelUnlink(req) => req.request_id,
UpdateCachedRelSize(req) => req.request_id,
}
}
}
/// Special quick result to a CGetPageVRequest request, indicating that the
/// the requested pages are present in the local file cache. The backend can
/// read the blocks directly from the given LFC blocks.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CCachedGetPageVResult {
pub cache_block_numbers: [u64; MAX_GETPAGEV_PAGES],
}
/// ShmemBuf represents a buffer in shared memory.
///
/// SAFETY: The pointer must point to an area in shared memory. The functions allow you to liberally
/// get a mutable pointer to the contents; it is the caller's responsibility to ensure that you
/// don't access a buffer that's you're not allowed to. Inappropriate access to the buffer doesn't
/// violate Rust's safety semantics, but it will mess up and crash Postgres.
///
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct ShmemBuf {
// Pointer to where the result is written or where to read from. Must point into a buffer in shared memory!
pub ptr: *mut u8,
}
unsafe impl Send for ShmemBuf {}
unsafe impl Sync for ShmemBuf {}
unsafe impl uring_common::buf::IoBuf for ShmemBuf {
fn stable_ptr(&self) -> *const u8 {
self.ptr
}
fn bytes_init(&self) -> usize {
crate::BLCKSZ
}
fn bytes_total(&self) -> usize {
crate::BLCKSZ
}
}
unsafe impl uring_common::buf::IoBufMut for ShmemBuf {
fn stable_mut_ptr(&mut self) -> *mut u8 {
self.ptr
}
unsafe fn set_init(&mut self, pos: usize) {
if pos > crate::BLCKSZ {
panic!(
"set_init called past end of buffer, pos {}, buffer size {}",
pos,
crate::BLCKSZ
);
}
}
}
impl ShmemBuf {
pub fn as_mut_ptr(&self) -> *mut u8 {
self.ptr
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelSizeRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub allow_missing: bool,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CGetPageVRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CGetPageVUncachedRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
pub request_lsn: CLsn,
pub not_modified_since: CLsn,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CReadSlruSegmentRequest {
pub request_id: u64,
pub slru_kind: SlruKind,
pub segment_number: u32,
pub request_lsn: CLsn,
/// Must be a null-terminated C string containing the file path
/// where the communicator will write the SLRU segment.
pub destination_file_path: ShmemBuf,
}
impl CReadSlruSegmentRequest {
/// Returns the file path where the communicator will write the
/// SLRU segment.
pub(crate) fn destination_file_path(&self) -> String {
unsafe { CStr::from_ptr(self.destination_file_path.as_mut_ptr() as *const _) }
.to_string_lossy()
.into_owned()
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CPrefetchVRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CDbSizeRequest {
pub request_id: u64,
pub db_oid: COid,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CWritePageRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// `src` defines the new page contents. Must point into a buffer in shared memory!
pub src: ShmemBuf,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelExtendRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// `src` defines the new page contents. Must point into a buffer in shared memory!
pub src: ShmemBuf,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelZeroExtendRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u32,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelCreateRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelTruncateRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub nblocks: u32,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelUnlinkRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub lsn: CLsn,
}
impl CRelSizeRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CGetPageVRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CGetPageVUncachedRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CPrefetchVRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CWritePageRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelExtendRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelZeroExtendRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelCreateRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelTruncateRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelUnlinkRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CUpdateCachedRelSizeRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub nblocks: u32,
pub lsn: CLsn,
}
impl CUpdateCachedRelSizeRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}

View File

@@ -4,13 +4,10 @@
//!
//! These are called from the communicator threads! Careful what you do, most Postgres
//! functions are not safe to call in that context.
use utils::lsn::Lsn;
#[cfg(not(test))]
unsafe extern "C" {
pub fn notify_proc_unsafe(procno: std::ffi::c_int);
pub fn callback_set_my_latch_unsafe();
pub fn callback_get_request_lsn_unsafe() -> crate::neon_request::CLsn;
pub fn callback_get_lfc_metrics_unsafe() -> LfcMetrics;
}
@@ -19,36 +16,20 @@ unsafe extern "C" {
// package, but the code coverage build still builds these and tries to link with the
// external C code.)
#[cfg(test)]
unsafe fn notify_proc_unsafe(_procno: std::ffi::c_int) {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_set_my_latch_unsafe() {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_get_request_lsn_unsafe() -> crate::neon_request::CLsn {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_get_lfc_metrics_unsafe() -> LfcMetrics {
panic!("not usable in unit tests");
}
// safe wrappers
pub(super) fn notify_proc(procno: std::ffi::c_int) {
unsafe { notify_proc_unsafe(procno) };
}
pub(super) fn callback_set_my_latch() {
unsafe { callback_set_my_latch_unsafe() };
}
pub(super) fn get_request_lsn() -> Lsn {
Lsn(unsafe { callback_get_request_lsn_unsafe() })
}
pub(super) fn callback_get_lfc_metrics() -> LfcMetrics {
unsafe { callback_get_lfc_metrics_unsafe() }
}

Some files were not shown because too many files have changed in this diff Show More