Compare commits


4 Commits

Author SHA1 Message Date
BodoBolero
be2ad49a62 move benchmarks to another repo 2025-08-12 19:03:20 +02:00
Ruslan Talpa
d96cea1917 [proxy] handle options request in rest broker (cors headers) (#12744)
## Problem
The rest broker needs to respond with the correct CORS headers for the API
to be usable from other domains.

## Summary of changes
Added a code path in the rest broker to handle OPTIONS requests.
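
A minimal sketch of such an OPTIONS short-circuit, assuming the `http` crate types; the function name and header set are illustrative, not the actual rest broker code:

```rust
// Illustrative only: answer CORS preflight (OPTIONS) requests before normal routing.
use http::{Method, Request, Response, StatusCode, header};

fn handle_preflight<B: Default>(req: &Request<B>) -> Option<Response<B>> {
    if req.method() != Method::OPTIONS {
        return None; // not a preflight request, fall through to normal handling
    }
    let resp = Response::builder()
        .status(StatusCode::NO_CONTENT)
        .header(header::ACCESS_CONTROL_ALLOW_ORIGIN, "*")
        .header(header::ACCESS_CONTROL_ALLOW_METHODS, "GET, POST, OPTIONS")
        .header(
            header::ACCESS_CONTROL_ALLOW_HEADERS,
            "Authorization, Content-Type",
        )
        .body(B::default())
        .expect("static response parts are always valid");
    Some(resp)
}
```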

---------

Co-authored-by: Ruslan Talpa <ruslan.talpa@databricks.com>
2025-07-31 13:05:09 +00:00
Dmitrii Kovalkov
312a74f11f storcon: implement safekeeper_migrate_abort handler (#12705)
## Problem
Right now, once we commit a joint configuration to the DB, there is no way
back: the only way to get a clean mconf is to continue the migration. The RFC
also describes an abort mechanism that aborts the current migration and
reverts the mconf change. It may be needed if the migration is stuck and
cannot make progress, e.g. if the safekeeper we are migrating to went down
during the migration. This PR implements that abort algorithm.

- Closes: https://databricks.atlassian.net/browse/LKB-899
- Closes: https://github.com/neondatabase/neon/issues/12549

## Summary of changes
- Implement the `safekeeper_migrate_abort` handler with the algorithm
described in the RFC (a usage sketch follows this list)
- Add a `timeline-safekeeper-migrate-abort` subcommand to `storcon_cli`
- Add a test for the migration abort algorithm.
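
A hedged sketch of driving the new abort endpoint directly over HTTP; the `storcon_cli` subcommand wraps the same POST, and the base URL and identifiers here are placeholders:

```rust
// Illustrative only: POST to the storage controller's abort endpoint.
async fn abort_safekeeper_migration(
    storcon_base_url: &str,
    tenant_id: &str,
    timeline_id: &str,
) -> anyhow::Result<()> {
    let url = format!(
        "{storcon_base_url}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort"
    );
    let resp = reqwest::Client::new().post(url).send().await?;
    anyhow::ensure!(
        resp.status().is_success(),
        "abort request failed: {}",
        resp.status()
    );
    Ok(())
}
```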
2025-07-31 12:40:32 +00:00
Mikhail
df4e37b7cc Report timespans for promotion and prewarm (#12730)
- Return time spans of sub-actions for prewarm, prewarm offload, and
promotion in the HTTP handlers.
- Set `synchronous_standby_names=walproposer` for promoted endpoints (see the
sketch after this list). Otherwise, the walproposer on a promoted standby
ignores replies from the safekeeper and gets stuck waiting on the commit LSN
forever.
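
A sketch of that promotion step, assuming a `tokio_postgres::Client` connected to the promoted endpoint (simplified from the change in `compute_promote.rs`):

```rust
// Illustrative only: enable synchronous replication to the walproposer
// and reload the config so the setting takes effect without a restart.
async fn enable_walproposer_sync(client: &tokio_postgres::Client) -> anyhow::Result<()> {
    // ALTER SYSTEM SET does not accept bind parameters, so the value is inlined.
    client
        .query("ALTER SYSTEM SET synchronous_standby_names=walproposer", &[])
        .await?;
    // Apply the new setting on the running server.
    client
        .query("SELECT pg_catalog.pg_reload_conf()", &[])
        .await?;
    Ok(())
}
```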
2025-07-31 11:51:19 +00:00
79 changed files with 1500 additions and 8012 deletions

View File

@@ -300,7 +300,9 @@ jobs:
benchmarks:
# `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `deploy` in PRs
if: github.ref_name == 'main' || (contains(github.event.pull_request.labels.*.name, 'run-benchmarks') && !failure() && !cancelled())
# if: github.ref_name == 'main' || (contains(github.event.pull_request.labels.*.name, 'run-benchmarks') && !failure() && !cancelled())
# moved to another repo
if: false
needs: [ check-permissions, build-build-tools-image, get-benchmarks-durations, deploy ]
permissions:
id-token: write # aws-actions/configure-aws-credentials

1
.gitignore vendored
View File

@@ -15,7 +15,6 @@ neon.iml
/.neon
/integration_tests/.neon
compaction-suite-results.*
pgxn/neon/communicator/communicator_bindings.h
docker-compose/docker-compose-parallel.yml
# Coverage

69
Cargo.lock generated
View File

@@ -259,17 +259,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
[[package]]
name = "atomic_enum"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -1307,31 +1296,13 @@ dependencies = [
name = "communicator"
version = "0.1.0"
dependencies = [
"atomic_enum",
"axum",
"bytes",
"cbindgen",
"clashmap",
"fiemap",
"http 1.3.1",
"itertools 0.10.5",
"libc",
"measured",
"neon-shmem",
"nix 0.30.1",
"pageserver_api",
"pageserver_client_grpc",
"pageserver_page_api",
"prometheus",
"prost 0.13.5",
"strum_macros",
"thiserror 1.0.69",
"tokio",
"tokio-pipe",
"tonic",
"tracing",
"tracing-subscriber",
"uring-common",
"utils",
"workspace_hack",
]
@@ -1672,9 +1643,9 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
[[package]]
name = "crossterm"
@@ -2338,15 +2309,6 @@ version = "0.2.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d"
[[package]]
name = "fiemap"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "92e387bc8b3342ba5cd115fb566e6bf2c82562433dffcecbc2474265cf8a875a"
dependencies = [
"bitflags 2.8.0",
]
[[package]]
name = "filetime"
version = "0.2.22"
@@ -4429,16 +4391,13 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"axum",
"bytes",
"camino",
"clap",
"futures",
"hdrhistogram",
"http 1.3.1",
"humantime",
"humantime-serde",
"metrics",
"pageserver_api",
"pageserver_client",
"pageserver_client_grpc",
@@ -4528,7 +4487,6 @@ dependencies = [
"pageserver_client",
"pageserver_compaction",
"pageserver_page_api",
"peekable",
"pem",
"pin-project-lite",
"postgres-protocol",
@@ -4542,7 +4500,6 @@ dependencies = [
"pprof",
"pq_proto",
"procfs",
"prost 0.13.5",
"rand 0.9.1",
"range-set-blaze",
"regex",
@@ -4837,15 +4794,6 @@ dependencies = [
"sha2",
]
[[package]]
name = "peekable"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
dependencies = [
"smallvec",
]
[[package]]
name = "pem"
version = "3.0.3"
@@ -7698,16 +7646,6 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "tokio-pipe"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
dependencies = [
"libc",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.10"
@@ -9075,8 +9013,8 @@ dependencies = [
"clap",
"clap_builder",
"const-oid",
"criterion",
"crossbeam-epoch",
"crossbeam-utils",
"crypto-bigint 0.5.5",
"der 0.7.8",
"deranged",
@@ -9119,6 +9057,7 @@ dependencies = [
"num-iter",
"num-rational",
"num-traits",
"once_cell",
"p256 0.13.2",
"parquet",
"portable-atomic",

View File

@@ -93,7 +93,6 @@ clap = { version = "4.0", features = ["derive", "env"] }
clashmap = { version = "1.0", features = ["raw-api"] }
comfy-table = "7.1"
const_format = "0.2"
crossbeam-utils = "0.8.21"
crc32c = "0.6"
diatomic-waker = { version = "0.2.3" }
either = "1.8"
@@ -153,7 +152,6 @@ parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pem = "3.0.3"
peekable = "0.3.0"
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
procfs = "0.16"
@@ -192,7 +190,6 @@ smallvec = "1.11"
smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
spki = "0.7.3"
spin = "0.9.8"
strum = "0.26"
strum_macros = "0.26"
"subtle" = "2.5.0"
@@ -204,6 +201,7 @@ thiserror = "1.0"
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
tokio = { version = "1.43.1", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
@@ -244,9 +242,6 @@ zeroize = "1.8"
env_logger = "0.11"
log = "0.4"
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }

View File

@@ -2780,7 +2780,7 @@ LIMIT 100",
// 4. We start again and try to prewarm with the state from 2. instead of the previous complete state
if matches!(
prewarm_state,
LfcPrewarmState::Completed
LfcPrewarmState::Completed { .. }
| LfcPrewarmState::NotPrewarmed
| LfcPrewarmState::Skipped
) {

View File

@@ -7,19 +7,11 @@ use http::StatusCode;
use reqwest::Client;
use std::mem::replace;
use std::sync::Arc;
use std::time::Instant;
use tokio::{io::AsyncReadExt, select, spawn};
use tokio_util::sync::CancellationToken;
use tracing::{error, info};
#[derive(serde::Serialize, Default)]
pub struct LfcPrewarmStateWithProgress {
#[serde(flatten)]
base: LfcPrewarmState,
total: i32,
prewarmed: i32,
skipped: i32,
}
/// A pair of url and a token to query endpoint storage for LFC prewarm-related tasks
struct EndpointStoragePair {
url: String,
@@ -28,7 +20,7 @@ struct EndpointStoragePair {
const KEY: &str = "lfc_state";
impl EndpointStoragePair {
/// endpoint_id is set to None while prewarming from other endpoint, see replica promotion
/// endpoint_id is set to None while prewarming from other endpoint, see compute_promote.rs
/// If not None, takes precedence over pspec.spec.endpoint_id
fn from_spec_and_endpoint(
pspec: &crate::compute::ParsedSpec,
@@ -54,36 +46,8 @@ impl EndpointStoragePair {
}
impl ComputeNode {
// If prewarm failed, we want to get overall number of segments as well as done ones.
// However, this function should be reliable even if querying postgres failed.
pub async fn lfc_prewarm_state(&self) -> LfcPrewarmStateWithProgress {
info!("requesting LFC prewarm state from postgres");
let mut state = LfcPrewarmStateWithProgress::default();
{
state.base = self.state.lock().unwrap().lfc_prewarm_state.clone();
}
let client = match ComputeNode::get_maintenance_client(&self.tokio_conn_conf).await {
Ok(client) => client,
Err(err) => {
error!(%err, "connecting to postgres");
return state;
}
};
let row = match client
.query_one("select * from neon.get_prewarm_info()", &[])
.await
{
Ok(row) => row,
Err(err) => {
error!(%err, "querying LFC prewarm status");
return state;
}
};
state.total = row.try_get(0).unwrap_or_default();
state.prewarmed = row.try_get(1).unwrap_or_default();
state.skipped = row.try_get(2).unwrap_or_default();
state
pub async fn lfc_prewarm_state(&self) -> LfcPrewarmState {
self.state.lock().unwrap().lfc_prewarm_state.clone()
}
pub fn lfc_offload_state(&self) -> LfcOffloadState {
@@ -133,7 +97,6 @@ impl ComputeNode {
}
/// Request LFC state from endpoint storage and load corresponding pages into Postgres.
/// Returns a result with `false` if the LFC state is not found in endpoint storage.
async fn prewarm_impl(
&self,
from_endpoint: Option<String>,
@@ -148,6 +111,7 @@ impl ComputeNode {
fail::fail_point!("compute-prewarm", |_| bail!("compute-prewarm failpoint"));
info!(%url, "requesting LFC state from endpoint storage");
let mut now = Instant::now();
let request = Client::new().get(&url).bearer_auth(storage_token);
let response = select! {
_ = token.cancelled() => return Ok(LfcPrewarmState::Cancelled),
@@ -160,6 +124,8 @@ impl ComputeNode {
StatusCode::NOT_FOUND => return Ok(LfcPrewarmState::Skipped),
status => bail!("{status} querying endpoint storage"),
}
let state_download_time_ms = now.elapsed().as_millis() as u32;
now = Instant::now();
let mut uncompressed = Vec::new();
let lfc_state = select! {
@@ -174,6 +140,8 @@ impl ComputeNode {
read = decoder.read_to_end(&mut uncompressed) => read
}
.context("decoding LFC state")?;
let uncompress_time_ms = now.elapsed().as_millis() as u32;
now = Instant::now();
let uncompressed_len = uncompressed.len();
info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}");
@@ -196,15 +164,34 @@ impl ComputeNode {
}
.context("loading LFC state into postgres")
.map(|_| ())?;
let prewarm_time_ms = now.elapsed().as_millis() as u32;
Ok(LfcPrewarmState::Completed)
let row = client
.query_one("select * from neon.get_prewarm_info()", &[])
.await
.context("querying prewarm info")?;
let total = row.try_get(0).unwrap_or_default();
let prewarmed = row.try_get(1).unwrap_or_default();
let skipped = row.try_get(2).unwrap_or_default();
Ok(LfcPrewarmState::Completed {
total,
prewarmed,
skipped,
state_download_time_ms,
uncompress_time_ms,
prewarm_time_ms,
})
}
/// If offload request is ongoing, return false, true otherwise
pub fn offload_lfc(self: &Arc<Self>) -> bool {
{
let state = &mut self.state.lock().unwrap().lfc_offload_state;
if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading {
if matches!(
replace(state, LfcOffloadState::Offloading),
LfcOffloadState::Offloading
) {
return false;
}
}
@@ -216,7 +203,10 @@ impl ComputeNode {
pub async fn offload_lfc_async(self: &Arc<Self>) {
{
let state = &mut self.state.lock().unwrap().lfc_offload_state;
if replace(state, LfcOffloadState::Offloading) == LfcOffloadState::Offloading {
if matches!(
replace(state, LfcOffloadState::Offloading),
LfcOffloadState::Offloading
) {
return;
}
}
@@ -234,7 +224,6 @@ impl ComputeNode {
LfcOffloadState::Failed { error }
}
};
self.state.lock().unwrap().lfc_offload_state = state;
}
@@ -242,6 +231,7 @@ impl ComputeNode {
let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
info!(%url, "requesting LFC state from Postgres");
let mut now = Instant::now();
let row = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?
@@ -255,25 +245,36 @@ impl ComputeNode {
info!(%url, "empty LFC state, not exporting");
return Ok(LfcOffloadState::Skipped);
};
let state_query_time_ms = now.elapsed().as_millis() as u32;
now = Instant::now();
let mut compressed = Vec::new();
ZstdEncoder::new(state)
.read_to_end(&mut compressed)
.await
.context("compressing LFC state")?;
let compress_time_ms = now.elapsed().as_millis() as u32;
now = Instant::now();
let compressed_len = compressed.len();
info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage");
info!(%url, "downloaded LFC state, compressed size {compressed_len}");
let request = Client::new().put(url).bearer_auth(token).body(compressed);
match request.send().await {
Ok(res) if res.status() == StatusCode::OK => Ok(LfcOffloadState::Completed),
Ok(res) => bail!(
"Request to endpoint storage failed with status: {}",
res.status()
),
Err(err) => Err(err).context("writing to endpoint storage"),
let response = request
.send()
.await
.context("writing to endpoint storage")?;
let state_upload_time_ms = now.elapsed().as_millis() as u32;
let status = response.status();
if status != StatusCode::OK {
bail!("request to endpoint storage failed: {status}");
}
Ok(LfcOffloadState::Completed {
compress_time_ms,
state_query_time_ms,
state_upload_time_ms,
})
}
pub fn cancel_prewarm(self: &Arc<Self>) {

View File

@@ -1,32 +1,24 @@
use crate::compute::ComputeNode;
use anyhow::{Context, Result, bail};
use anyhow::{Context, bail};
use compute_api::responses::{LfcPrewarmState, PromoteConfig, PromoteState};
use compute_api::spec::ComputeMode;
use itertools::Itertools;
use std::collections::HashMap;
use std::{sync::Arc, time::Duration};
use tokio::time::sleep;
use std::time::Instant;
use tracing::info;
use utils::lsn::Lsn;
impl ComputeNode {
/// Returns only when promote fails or succeeds. If a network error occurs
/// and http client disconnects, this does not stop promotion, and subsequent
/// calls block until promote finishes.
/// Returns only when promote fails or succeeds. If http client calling this function
/// disconnects, this does not stop promotion, and subsequent calls block until promote finishes.
/// Called by control plane on secondary after primary endpoint is terminated
/// Has a failpoint "compute-promotion"
pub async fn promote(self: &Arc<Self>, cfg: PromoteConfig) -> PromoteState {
let cloned = self.clone();
let promote_fn = async move || {
let Err(err) = cloned.promote_impl(cfg).await else {
return PromoteState::Completed;
};
tracing::error!(%err, "promoting");
PromoteState::Failed {
error: format!("{err:#}"),
pub async fn promote(self: &std::sync::Arc<Self>, cfg: PromoteConfig) -> PromoteState {
let this = self.clone();
let promote_fn = async move || match this.promote_impl(cfg).await {
Ok(state) => state,
Err(err) => {
tracing::error!(%err, "promoting replica");
let error = format!("{err:#}");
PromoteState::Failed { error }
}
};
let start_promotion = || {
let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
tokio::spawn(async move { tx.send(promote_fn().await) });
@@ -34,36 +26,31 @@ impl ComputeNode {
};
let mut task;
// self.state is unlocked after block ends so we lock it in promote_impl
// and task.changed() is reached
// promote_impl locks self.state so we need to unlock it before calling task.changed()
{
task = self
.state
.lock()
.unwrap()
.promote_state
.get_or_insert_with(start_promotion)
.clone()
let promote_state = &mut self.state.lock().unwrap().promote_state;
task = promote_state.get_or_insert_with(start_promotion).clone()
}
if task.changed().await.is_err() {
let error = "promote sender dropped".to_string();
return PromoteState::Failed { error };
}
task.changed().await.expect("promote sender dropped");
task.borrow().clone()
}
async fn promote_impl(&self, mut cfg: PromoteConfig) -> Result<()> {
async fn promote_impl(&self, cfg: PromoteConfig) -> anyhow::Result<PromoteState> {
{
let state = self.state.lock().unwrap();
let mode = &state.pspec.as_ref().unwrap().spec.mode;
if *mode != ComputeMode::Replica {
bail!("{} is not replica", mode.to_type_str());
if *mode != compute_api::spec::ComputeMode::Replica {
bail!("compute mode \"{}\" is not replica", mode.to_type_str());
}
// we don't need to query Postgres so not self.lfc_prewarm_state()
match &state.lfc_prewarm_state {
LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
bail!("prewarm not requested or pending")
status @ (LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming) => {
bail!("compute {status}")
}
LfcPrewarmState::Failed { error } => {
tracing::warn!(%error, "replica prewarm failed")
tracing::warn!(%error, "compute prewarm failed")
}
_ => {}
}
@@ -72,9 +59,10 @@ impl ComputeNode {
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?;
let mut now = Instant::now();
let primary_lsn = cfg.wal_flush_lsn;
let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
let mut standby_lsn = utils::lsn::Lsn::INVALID;
const RETRIES: i32 = 20;
for i in 0..=RETRIES {
let row = client
@@ -82,16 +70,18 @@ impl ComputeNode {
.await
.context("getting last replay lsn")?;
let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
last_wal_replay_lsn = lsn.into();
if last_wal_replay_lsn >= primary_lsn {
standby_lsn = lsn.into();
if standby_lsn >= primary_lsn {
break;
}
info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
sleep(Duration::from_secs(1)).await;
info!(%standby_lsn, %primary_lsn, "catching up, try {i}");
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
}
if last_wal_replay_lsn < primary_lsn {
if standby_lsn < primary_lsn {
bail!("didn't catch up with primary in {RETRIES} retries");
}
let lsn_wait_time_ms = now.elapsed().as_millis() as u32;
now = Instant::now();
// using $1 doesn't work with ALTER SYSTEM SET
let safekeepers_sql = format!(
@@ -102,27 +92,33 @@ impl ComputeNode {
.query(&safekeepers_sql, &[])
.await
.context("setting safekeepers")?;
client
.query(
"ALTER SYSTEM SET synchronous_standby_names=walproposer",
&[],
)
.await
.context("setting synchronous_standby_names")?;
client
.query("SELECT pg_catalog.pg_reload_conf()", &[])
.await
.context("reloading postgres config")?;
#[cfg(feature = "testing")]
fail::fail_point!("compute-promotion", |_| {
bail!("promotion configured to fail because of a failpoint")
});
fail::fail_point!("compute-promotion", |_| bail!(
"compute-promotion failpoint"
));
let row = client
.query_one("SELECT * FROM pg_catalog.pg_promote()", &[])
.await
.context("pg_promote")?;
if !row.get::<usize, bool>(0) {
bail!("pg_promote() returned false");
bail!("pg_promote() failed");
}
let pg_promote_time_ms = now.elapsed().as_millis() as u32;
let now = Instant::now();
let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
.await
.context("connecting to postgres")?;
let row = client
.query_one("SHOW transaction_read_only", &[])
.await
@@ -131,36 +127,47 @@ impl ComputeNode {
bail!("replica in read only mode after promotion");
}
// Already checked validity in http handler
#[allow(unused_mut)]
let mut new_pspec = crate::compute::ParsedSpec::try_from(cfg.spec).expect("invalid spec");
{
let mut state = self.state.lock().unwrap();
let spec = &mut state.pspec.as_mut().unwrap().spec;
spec.mode = ComputeMode::Primary;
let new_conf = cfg.spec.cluster.postgresql_conf.as_mut().unwrap();
let existing_conf = spec.cluster.postgresql_conf.as_ref().unwrap();
Self::merge_spec(new_conf, existing_conf);
// Local setup has different ports for pg process (port=) for primary and secondary.
// Primary is stopped so we need secondary's "port" value
#[cfg(feature = "testing")]
{
let old_spec = &state.pspec.as_ref().unwrap().spec;
let Some(old_conf) = old_spec.cluster.postgresql_conf.as_ref() else {
bail!("pspec.spec.cluster.postgresql_conf missing for endpoint");
};
let set: std::collections::HashMap<&str, &str> = old_conf
.split_terminator('\n')
.map(|e| e.split_once("=").expect("invalid item"))
.collect();
let Some(new_conf) = new_pspec.spec.cluster.postgresql_conf.as_mut() else {
bail!("pspec.spec.cluster.postgresql_conf missing for supplied config");
};
new_conf.push_str(&format!("port={}\n", set["port"]));
}
tracing::debug!("applied spec: {:#?}", new_pspec.spec);
if self.params.lakebase_mode {
ComputeNode::set_spec(&self.params, &mut state, new_pspec);
} else {
state.pspec = Some(new_pspec);
}
}
info!("applied new spec, reconfiguring as primary");
self.reconfigure()
}
self.reconfigure()?;
let reconfigure_time_ms = now.elapsed().as_millis() as u32;
/// Merge old and new Postgres conf specs to apply on secondary.
/// Change new spec's port and safekeepers since they are supplied
/// differenly
fn merge_spec(new_conf: &mut String, existing_conf: &str) {
let mut new_conf_set: HashMap<&str, &str> = new_conf
.split_terminator('\n')
.map(|e| e.split_once("=").expect("invalid item"))
.collect();
new_conf_set.remove("neon.safekeepers");
let existing_conf_set: HashMap<&str, &str> = existing_conf
.split_terminator('\n')
.map(|e| e.split_once("=").expect("invalid item"))
.collect();
new_conf_set.insert("port", existing_conf_set["port"]);
*new_conf = new_conf_set
.iter()
.map(|(k, v)| format!("{k}={v}"))
.join("\n");
Ok(PromoteState::Completed {
lsn_wait_time_ms,
pg_promote_time_ms,
reconfigure_time_ms,
})
}
}

View File

@@ -8,7 +8,7 @@ use std::path::Path;
use compute_api::responses::TlsConfig;
use compute_api::spec::{
ComputeAudit, ComputeMode, ComputeSpec, DatabricksSettings, GenericOption, PageserverProtocol,
ComputeAudit, ComputeMode, ComputeSpec, DatabricksSettings, GenericOption,
};
use crate::compute::ComputeNodeParams;
@@ -69,15 +69,6 @@ pub fn write_postgres_conf(
writeln!(file, "# Neon storage settings")?;
writeln!(file)?;
if let Some(conninfo) = &spec.pageserver_connection_info {
match conninfo.prefer_protocol {
PageserverProtocol::Libpq => {
writeln!(file, "neon.use_communicator_worker=false")?;
}
PageserverProtocol::Grpc => {
writeln!(file, "neon.use_communicator_worker=true")?;
}
}
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = conninfo.stripe_size {
writeln!(
@@ -88,7 +79,6 @@ pub fn write_postgres_conf(
}
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
let num_shards = if conninfo.shard_count.0 == 0 {
1 // unsharded, treat it as a single shard
} else {
@@ -121,14 +111,6 @@ pub fn write_postgres_conf(
} else {
libpq_urls = None
}
// Similarly for gRPC URLs
if let Some(url) = &first_pageserver.grpc_url {
if let Some(ref mut urls) = grpc_urls {
urls.push(url.clone());
}
} else {
grpc_urls = None
}
}
if let Some(libpq_urls) = libpq_urls {
writeln!(
@@ -143,22 +125,7 @@ pub fn write_postgres_conf(
} else {
writeln!(file, "# no neon.pageserver_connstring")?;
}
if let Some(grpc_urls) = grpc_urls {
writeln!(
file,
"# derived from compute spec's pageserver_conninfo field"
)?;
writeln!(
file,
"neon.pageserver_grpc_urls={}",
escape_conf_value(&grpc_urls.join(","))
)?;
} else {
writeln!(file, "# no neon.pageserver_grpc_urls")?;
}
} else {
writeln!(file, "neon.use_communicator_worker=false")?;
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "# from compute spec's shard_stripe_size field")?;

View File

@@ -617,9 +617,6 @@ components:
type: object
required:
- status
- total
- prewarmed
- skipped
properties:
status:
description: LFC prewarm status
@@ -637,6 +634,15 @@ components:
skipped:
description: Pages processed but not prewarmed
type: integer
state_download_time_ms:
description: Time it takes to download LFC state to compute
type: integer
uncompress_time_ms:
description: Time it takes to uncompress LFC state
type: integer
prewarm_time_ms:
description: Time it takes to prewarm LFC state in Postgres
type: integer
LfcOffloadState:
type: object
@@ -650,6 +656,16 @@ components:
error:
description: LFC offload error, if any
type: string
state_query_time_ms:
description: Time it takes to get LFC state from Postgres
type: integer
compress_time_ms:
description: Time it takes to compress LFC state
type: integer
state_upload_time_ms:
description: Time it takes to upload LFC state to endpoint storage
type: integer
PromoteState:
type: object
@@ -663,6 +679,15 @@ components:
error:
description: Promote error, if any
type: string
lsn_wait_time_ms:
description: Time it takes for secondary to catch up with primary WAL flush LSN
type: integer
pg_promote_time_ms:
description: Time it takes to call pg_promote on secondary
type: integer
reconfigure_time_ms:
description: Time it takes to reconfigure promoted secondary
type: integer
SetRoleGrantsRequest:
type: object

View File

@@ -1,12 +1,11 @@
use crate::compute_prewarm::LfcPrewarmStateWithProgress;
use crate::http::JsonResponse;
use axum::response::{IntoResponse, Response};
use axum::{Json, http::StatusCode};
use axum_extra::extract::OptionalQuery;
use compute_api::responses::LfcOffloadState;
use compute_api::responses::{LfcOffloadState, LfcPrewarmState};
type Compute = axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>;
pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json<LfcPrewarmStateWithProgress> {
pub(in crate::http) async fn prewarm_state(compute: Compute) -> Json<LfcPrewarmState> {
Json(compute.lfc_prewarm_state().await)
}

View File

@@ -1,11 +1,22 @@
use crate::http::JsonResponse;
use axum::extract::Json;
use compute_api::responses::PromoteConfig;
use http::StatusCode;
pub(in crate::http) async fn promote(
compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
Json(cfg): Json<compute_api::responses::PromoteConfig>,
Json(cfg): Json<PromoteConfig>,
) -> axum::response::Response {
// Return early at the cost of extra parsing spec
let pspec = match crate::compute::ParsedSpec::try_from(cfg.spec) {
Ok(p) => p,
Err(e) => return JsonResponse::error(StatusCode::BAD_REQUEST, e),
};
let cfg = PromoteConfig {
spec: pspec.spec,
wal_flush_lsn: cfg.wal_flush_lsn,
};
let state = compute.promote(cfg).await;
if let compute_api::responses::PromoteState::Failed { error: _ } = state {
return JsonResponse::create_response(StatusCode::INTERNAL_SERVER_ERROR, state);

View File

@@ -28,10 +28,7 @@ pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc<ComputeNode>) {
let compute = compute.clone();
let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn);
let runtime = tokio::runtime::Handle::current();
thread::spawn(move || {
let _rt_guard = runtime.enter();
let _entered = span.entered();
if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) {
// TODO: might need stronger error feedback than logging an warning.

View File

@@ -303,6 +303,13 @@ enum Command {
#[arg(long, required = true, value_delimiter = ',')]
new_sk_set: Vec<NodeId>,
},
/// Abort ongoing safekeeper migration.
TimelineSafekeeperMigrateAbort {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
timeline_id: TimelineId,
},
}
#[derive(Parser)]
@@ -1396,6 +1403,17 @@ async fn main() -> anyhow::Result<()> {
)
.await?;
}
Command::TimelineSafekeeperMigrateAbort {
tenant_id,
timeline_id,
} => {
let path =
format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort");
storcon_client
.dispatch::<(), ()>(Method::POST, path, None)
.await?;
}
}
Ok(())

View File

@@ -120,11 +120,6 @@
"value": "host=pageserver port=6400",
"vartype": "string"
},
{
"name": "neon.pageserver_grpc_urls",
"value": "grpc://pageserver:6401/",
"vartype": "string"
},
{
"name": "max_replication_write_lag",
"value": "500MB",

View File

@@ -1,7 +1,6 @@
broker_endpoint='http://storage_broker:50051'
pg_distrib_dir='/usr/local/'
listen_pg_addr='0.0.0.0:6400'
listen_grpc_addr='0.0.0.0:6401'
listen_http_addr='0.0.0.0:9898'
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }
control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address

View File

@@ -1,10 +1,9 @@
//! Structs representing the JSON formats used in the compute_ctl's HTTP API.
use std::fmt::Display;
use chrono::{DateTime, Utc};
use jsonwebtoken::jwk::JwkSet;
use serde::{Deserialize, Serialize, Serializer};
use std::fmt::Display;
use crate::privilege::Privilege;
use crate::spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role};
@@ -49,7 +48,7 @@ pub struct ExtensionInstallResponse {
/// Status of the LFC prewarm process. The same state machine is reused for
/// both autoprewarm (prewarm after compute/Postgres start using the previously
/// stored LFC state) and explicit prewarming via API.
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
#[derive(Serialize, Default, Debug, Clone)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum LfcPrewarmState {
/// Default value when compute boots up.
@@ -59,7 +58,14 @@ pub enum LfcPrewarmState {
Prewarming,
/// We found requested LFC state in the endpoint storage and
/// completed prewarming successfully.
Completed,
Completed {
total: i32,
prewarmed: i32,
skipped: i32,
state_download_time_ms: u32,
uncompress_time_ms: u32,
prewarm_time_ms: u32,
},
/// Unexpected error happened during prewarming. Note, `Not Found 404`
/// response from the endpoint storage is explicitly excluded here
/// because it can normally happen on the first compute start,
@@ -84,7 +90,7 @@ impl Display for LfcPrewarmState {
match self {
LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
LfcPrewarmState::Completed => f.write_str("Completed"),
LfcPrewarmState::Completed { .. } => f.write_str("Completed"),
LfcPrewarmState::Skipped => f.write_str("Skipped"),
LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
LfcPrewarmState::Cancelled => f.write_str("Cancelled"),
@@ -92,26 +98,36 @@ impl Display for LfcPrewarmState {
}
}
#[derive(Serialize, Default, Debug, Clone, PartialEq)]
#[derive(Serialize, Default, Debug, Clone)]
#[serde(tag = "status", rename_all = "snake_case")]
pub enum LfcOffloadState {
#[default]
NotOffloaded,
Offloading,
Completed,
Completed {
state_query_time_ms: u32,
compress_time_ms: u32,
state_upload_time_ms: u32,
},
Failed {
error: String,
},
/// LFC state was empty so it wasn't offloaded
Skipped,
}
#[derive(Serialize, Debug, Clone, PartialEq)]
#[derive(Serialize, Debug, Clone)]
#[serde(tag = "status", rename_all = "snake_case")]
/// Response of /promote
pub enum PromoteState {
NotPromoted,
Completed,
Failed { error: String },
Completed {
lsn_wait_time_ms: u32,
pg_promote_time_ms: u32,
reconfigure_time_ms: u32,
},
Failed {
error: String,
},
}
#[derive(Deserialize, Default, Debug)]

View File

@@ -16,5 +16,5 @@ rustc-hash.workspace = true
tempfile = "3.14.0"
[dev-dependencies]
rand = "0.9"
rand_distr = "0.5.1"
rand.workspace = true
rand_distr = "0.5.1"

View File

@@ -1,330 +0,0 @@
use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
use neon_shmem::hash::HashMapAccess;
use neon_shmem::hash::HashMapInit;
use neon_shmem::hash::entry::Entry;
use rand::distr::{Distribution, StandardUniform};
use rand::prelude::*;
use std::default::Default;
use std::hash::BuildHasher;
// Taken from bindings to C code
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
#[repr(C)]
pub struct FileCacheKey {
pub _spc_id: u32,
pub _db_id: u32,
pub _rel_number: u32,
pub _fork_num: u32,
pub _block_num: u32,
}
impl Distribution<FileCacheKey> for StandardUniform {
// questionable, but doesn't need to be good randomness
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> FileCacheKey {
FileCacheKey {
_spc_id: rng.random(),
_db_id: rng.random(),
_rel_number: rng.random(),
_fork_num: rng.random(),
_block_num: rng.random(),
}
}
}
#[derive(Clone, Debug)]
#[repr(C)]
pub struct FileCacheEntry {
pub _offset: u32,
pub _access_count: u32,
pub _prev: *mut FileCacheEntry,
pub _next: *mut FileCacheEntry,
pub _state: [u32; 8],
}
impl FileCacheEntry {
fn dummy() -> Self {
Self {
_offset: 0,
_access_count: 0,
_prev: std::ptr::null_mut(),
_next: std::ptr::null_mut(),
_state: [0; 8],
}
}
}
// Utilities for applying operations.
#[derive(Clone, Debug)]
struct TestOp<K, V>(K, Option<V>);
fn apply_op<K: Clone + std::hash::Hash + Eq, V, S: std::hash::BuildHasher>(
op: TestOp<K, V>,
map: &mut HashMapAccess<K, V, S>,
) {
let entry = map.entry(op.0);
match op.1 {
Some(new) => match entry {
Entry::Occupied(mut e) => Some(e.insert(new)),
Entry::Vacant(e) => {
_ = e.insert(new).unwrap();
None
}
},
None => match entry {
Entry::Occupied(e) => Some(e.remove()),
Entry::Vacant(_) => None,
},
};
}
// Hash utilities
struct SeaRandomState {
k1: u64,
k2: u64,
k3: u64,
k4: u64,
}
impl std::hash::BuildHasher for SeaRandomState {
type Hasher = seahash::SeaHasher;
fn build_hasher(&self) -> Self::Hasher {
seahash::SeaHasher::with_seeds(self.k1, self.k2, self.k3, self.k4)
}
}
impl SeaRandomState {
fn new() -> Self {
let mut rng = rand::rng();
Self {
k1: rng.random(),
k2: rng.random(),
k3: rng.random(),
k4: rng.random(),
}
}
}
fn small_benchs(c: &mut Criterion) {
let mut group = c.benchmark_group("Small maps");
group.sample_size(10);
group.bench_function("small_rehash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_xxhash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(twox_hash::xxhash64::RandomState::default())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_ahash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(ahash::RandomState::default())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_seahash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(SeaRandomState::new())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.finish();
}
fn real_benchs(c: &mut Criterion) {
let mut group = c.benchmark_group("Realistic workloads");
group.sample_size(10);
group.bench_function("real_bulk_insert", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut rng = rand::rng();
b.iter_batched(
|| HashMapInit::new_resizeable(size, size * 2).attach_writer(),
|writer| {
for _ in 0..ideal_filled {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
let entry = writer.entry(key);
match entry {
Entry::Occupied(mut e) => {
std::hint::black_box(e.insert(val));
}
Entry::Vacant(e) => {
let _ = std::hint::black_box(e.insert(val).unwrap());
}
}
}
},
BatchSize::SmallInput,
)
});
group.bench_function("real_rehash", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("real_rehash_hashbrown", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut writer = hashbrown::raw::RawTable::new();
let mut rng = rand::rng();
let hasher = rustc_hash::FxBuildHasher;
unsafe {
writer
.resize(
size,
|(k, _)| hasher.hash_one(k),
hashbrown::raw::Fallibility::Infallible,
)
.unwrap();
}
while writer.len() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
hasher.hash_one(k)
});
}
b.iter(|| unsafe {
writer.table.rehash_in_place(
&|table, index| {
hasher.hash_one(
&table
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
.as_ref()
.0,
)
},
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
Some(|ptr| std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry)))
} else {
None
},
)
});
});
for elems in [2, 4, 8, 16, 32, 64, 96, 112] {
group.bench_with_input(
BenchmarkId::new("real_rehash_varied", elems),
&elems,
|b, &size| {
let ideal_filled = size * 1_000_000;
let size = 125_000_000;
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
},
);
group.bench_with_input(
BenchmarkId::new("real_rehash_varied_hashbrown", elems),
&elems,
|b, &size| {
let ideal_filled = size * 1_000_000;
let size = 125_000_000;
let mut writer = hashbrown::raw::RawTable::new();
let mut rng = rand::rng();
let hasher = rustc_hash::FxBuildHasher;
unsafe {
writer
.resize(
size,
|(k, _)| hasher.hash_one(k),
hashbrown::raw::Fallibility::Infallible,
)
.unwrap();
}
while writer.len() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
hasher.hash_one(k)
});
}
b.iter(|| unsafe {
writer.table.rehash_in_place(
&|table, index| {
hasher.hash_one(
&table
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
.as_ref()
.0,
)
},
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
Some(|ptr| {
std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry))
})
} else {
None
},
)
});
},
);
}
group.finish();
}
criterion_group!(benches, small_benchs, real_benchs);
criterion_main!(benches);

View File

@@ -16,7 +16,6 @@
//!
//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock.
use std::fmt::Debug;
use std::hash::{BuildHasher, Hash};
use std::mem::MaybeUninit;
@@ -41,75 +40,45 @@ pub enum HashMapShrinkError {
#[error("shmem resize failed: {0}")]
ResizeError(shmem::Error),
/// Occupied entries in to-be-shrunk space were encountered beginning at the given index.
#[error("occupied entry in deallocated space found at {2} (in deallocated range of {0}..{1})")]
RemainingEntries(usize, usize, usize),
#[error("occupied entry in deallocated space found at {0}")]
RemainingEntries(usize),
}
/// This represents a hash table that (possibly) lives in shared memory.
/// If a new process is launched with fork(), the child process inherits
/// this struct.
#[must_use]
pub struct HashMapInit<K: 'static, V: 'static, S = rustc_hash::FxBuildHasher> {
pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
shmem_handle: Option<ShmemHandle>,
shared_ptr: *mut HashMapShared<K, V>,
shared_ptr: *mut HashMapShared<'a, K, V>,
shared_size: usize,
hasher: S,
num_buckets: u32,
}
impl<K, V, S> Debug for HashMapInit<K, V, S>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HashMapInit")
.field("shmem_handle", &self.shmem_handle)
.field("shared_ptr", &self.shared_ptr)
.field("shared_size", &self.shared_size)
// .field("hasher", &self.hasher)
.field("num_buckets", &self.num_buckets)
.finish()
}
}
/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
/// If a child process is launched with fork(), the child process should
/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
///
/// XXX: We're not making use of it at the moment, but this struct could
/// hold process-local information in the future.
pub struct HashMapAccess<K: 'static, V: 'static, S = rustc_hash::FxBuildHasher> {
pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
shmem_handle: Option<ShmemHandle>,
shared_ptr: *mut HashMapShared<K, V>,
shared_ptr: *mut HashMapShared<'a, K, V>,
hasher: S,
}
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<K, V, S> {}
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<K, V, S> {}
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
impl<K, V, S> Debug for HashMapAccess<K, V, S>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HashMapAccess")
.field("shmem_handle", &self.shmem_handle)
.field("shared_ptr", &self.shared_ptr)
// .field("hasher", &self.hasher)
.finish()
}
}
impl<K: Clone + Hash + Eq + 'static, V: 'static, S> HashMapInit<K, V, S> {
impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
/// Change the 'hasher' used by the hash table.
///
/// NOTE: This must be called right after creating the hash table,
/// before inserting any entries and before calling attach_writer/reader.
/// Otherwise different accessors could be using different hash function,
/// with confusing results.
pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<K, V, T> {
pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<'a, K, V, T> {
HashMapInit {
hasher,
shmem_handle: self.shmem_handle,
@@ -177,7 +146,7 @@ impl<K: Clone + Hash + Eq + 'static, V: 'static, S> HashMapInit<K, V, S> {
}
/// Attach to a hash table for writing.
pub fn attach_writer(self) -> HashMapAccess<K, V, S> {
pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
HashMapAccess {
shmem_handle: self.shmem_handle,
shared_ptr: self.shared_ptr,
@@ -189,7 +158,7 @@ impl<K: Clone + Hash + Eq + 'static, V: 'static, S> HashMapInit<K, V, S> {
///
/// This is a holdover from a previous implementation and is being kept around for
/// backwards compatibility reasons.
pub fn attach_reader(self) -> HashMapAccess<K, V, S> {
pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
self.attach_writer()
}
}
@@ -206,14 +175,14 @@ impl<K: Clone + Hash + Eq + 'static, V: 'static, S> HashMapInit<K, V, S> {
/// dictionary
///
/// In between the above parts, there can be padding bytes to align the parts correctly.
type HashMapShared<K, V> = RwLock<CoreHashMap<K, V>>;
type HashMapShared<'a, K, V> = RwLock<CoreHashMap<'a, K, V>>;
impl<K, V: 'static> HashMapInit<K, V, rustc_hash::FxBuildHasher>
impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
where
K: Clone + Hash + Eq + 'static,
K: Clone + Hash + Eq,
{
/// Place the hash table within a user-supplied fixed memory area.
pub fn with_fixed(num_buckets: u32, area: &'static mut [MaybeUninit<u8>]) -> Self {
pub fn with_fixed(num_buckets: u32, area: &'a mut [MaybeUninit<u8>]) -> Self {
Self::new(
num_buckets,
None,
@@ -269,10 +238,9 @@ where
}
}
impl<K, V, S: BuildHasher> HashMapAccess<K, V, S>
impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
where
K: Clone + Hash + Eq + 'static,
V: 'static
K: Clone + Hash + Eq,
{
/// Hash a key using the map's hasher.
#[inline]
@@ -280,7 +248,7 @@ where
self.hasher.hash_one(key)
}
fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'_, K, V> {
fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
let dict_pos = hash as usize % map.dictionary.len();
let first = map.dictionary[dict_pos];
@@ -330,9 +298,9 @@ where
/// Get a reference to the entry containing a key.
///
/// NB: This takes a write lock as there's no way to distinguish whether the intention
/// NB: THis takes a write lock as there's no way to distinguish whether the intention
/// is to use the entry for reading or for writing in advance.
pub fn entry(&self, key: K) -> Entry<'_, K, V> {
pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
let hash = self.get_hash_value(&key);
self.entry_with_hash(key, hash)
}
@@ -366,7 +334,7 @@ where
/// Has more overhead than one would intuitively expect: performs both a clone of the key
/// due to the [`OccupiedEntry`] type owning the key and also a hash of the key in order
/// to enable repairing the hash chain if the entry is removed.
pub fn entry_at_bucket(&self, pos: usize) -> Option<OccupiedEntry<'_, K, V>> {
pub fn entry_at_bucket(&self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
let map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
if pos >= map.buckets.len() {
return None;
@@ -390,16 +358,6 @@ where
map.get_num_buckets()
}
/// Returns the logical number of buckets in the table (aka the amount of allocatable buckets).
pub fn get_num_logical_buckets(&self) -> usize {
let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
if map.alloc_limit == INVALID_POS {
map.get_num_buckets()
} else {
map.alloc_limit as usize
}
}
/// Return the key and value stored in bucket with given index. This can be used to
/// iterate through the hash map.
// TODO: An Iterator might be nicer. The communicator's clock algorithm needs to
@@ -441,7 +399,7 @@ where
/// in the process.
fn rehash_dict(
&self,
inner: &mut CoreHashMap<K, V>,
inner: &mut CoreHashMap<'a, K, V>,
buckets_ptr: *mut core::Bucket<K, V>,
end_ptr: *mut u8,
num_buckets: u32,
@@ -552,12 +510,12 @@ where
/// # Panics
/// Panics if called on a map initialized with [`HashMapInit::with_fixed`] or if `num_buckets` is
/// greater than the number of buckets in the map.
pub fn begin_shrink(&self, num_buckets: u32) {
pub fn begin_shrink(&mut self, num_buckets: u32) {
let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
// assert!(
// num_buckets <= map.get_num_buckets() as u32,
// "shrink called with a larger number of buckets"
// );
assert!(
num_buckets <= map.get_num_buckets() as u32,
"shrink called with a larger number of buckets"
);
_ = self
.shmem_handle
.as_ref()
@@ -602,9 +560,7 @@ where
for i in (num_buckets as usize)..map.buckets.len() {
if map.buckets[i].inner.is_some() {
return Err(HashMapShrinkError::RemainingEntries(
num_buckets as usize, map.buckets.len(), i)
);
return Err(HashMapShrinkError::RemainingEntries(i));
}
}

View File

@@ -1,6 +1,5 @@
//! Simple hash table with chaining.
use std::fmt::Debug;
use std::hash::Hash;
use std::mem::MaybeUninit;
@@ -18,25 +17,12 @@ pub(crate) struct Bucket<K, V> {
pub(crate) inner: Option<(K, V)>,
}
impl<K, V> Debug for Bucket<K, V>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Bucket")
.field("next", &self.next)
.field("inner", &self.inner)
.finish()
}
}
/// Core hash table implementation.
pub(crate) struct CoreHashMap<K: 'static, V: 'static> {
pub(crate) struct CoreHashMap<'a, K, V> {
/// Dictionary used to map hashes to bucket indices.
pub(crate) dictionary: &'static mut [u32],
pub(crate) dictionary: &'a mut [u32],
/// Buckets containing key-value pairs.
pub(crate) buckets: &'static mut [Bucket<K, V>],
pub(crate) buckets: &'a mut [Bucket<K, V>],
/// Head of the freelist.
pub(crate) free_head: u32,
/// Maximum index of a bucket allowed to be allocated. [`INVALID_POS`] if no limit.
@@ -45,27 +31,11 @@ pub(crate) struct CoreHashMap<K: 'static, V: 'static> {
pub(crate) buckets_in_use: u32,
}
impl<K, V> Debug for CoreHashMap<K, V>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("CoreHashMap")
.field("dictionary", &self.dictionary)
.field("buckets", &self.buckets)
.field("free_head", &self.free_head)
.field("alloc_limit", &self.alloc_limit)
.field("buckets_in_use", &self.buckets_in_use)
.finish()
}
}
/// Error for when there are no empty buckets left but one is needed.
#[derive(Debug, PartialEq)]
pub struct FullError;
impl<K: Clone + Hash + Eq, V> CoreHashMap<K, V> {
impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
const FILL_FACTOR: f32 = 0.60;
/// Estimate the size of data contained within the the hash map.
@@ -83,8 +53,8 @@ impl<K: Clone + Hash + Eq, V> CoreHashMap<K, V> {
}
pub fn new(
buckets: &'static mut [MaybeUninit<Bucket<K, V>>],
dictionary: &'static mut [MaybeUninit<u32>],
buckets: &'a mut [MaybeUninit<Bucket<K, V>>],
dictionary: &'a mut [MaybeUninit<u32>],
) -> Self {
// Initialize the buckets
for i in 0..buckets.len() {

View File

@@ -6,9 +6,9 @@ use crate::sync::{RwLockWriteGuard, ValueWriteGuard};
use std::hash::Hash;
use std::mem;
pub enum Entry<'b, K: 'static, V: 'static> {
Occupied(OccupiedEntry<'b, K, V>),
Vacant(VacantEntry<'b, K, V>),
pub enum Entry<'a, 'b, K, V> {
Occupied(OccupiedEntry<'a, 'b, K, V>),
Vacant(VacantEntry<'a, 'b, K, V>),
}
/// Enum representing the previous position within a chain.
@@ -22,9 +22,9 @@ pub(crate) enum PrevPos {
Unknown(u64),
}
pub struct OccupiedEntry<'b, K: 'static, V: 'static> {
pub struct OccupiedEntry<'a, 'b, K, V> {
/// Mutable reference to the map containing this entry.
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<K, V>>,
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
/// The key of the occupied entry
pub(crate) _key: K,
/// The index of the previous entry in the chain.
@@ -33,7 +33,7 @@ pub struct OccupiedEntry<'b, K: 'static, V: 'static> {
pub(crate) bucket_pos: u32,
}
impl<K: 'static, V: 'static> OccupiedEntry<'_, K, V> {
impl<K, V> OccupiedEntry<'_, '_, K, V> {
pub fn get(&self) -> &V {
&self.map.buckets[self.bucket_pos as usize]
.inner
@@ -61,10 +61,6 @@ impl<K: 'static, V: 'static> OccupiedEntry<'_, K, V> {
///
/// This may result in multiple bucket accesses if the entry was obtained by index as the
/// previous chain entry needs to be discovered in this case.
///
/// # Panics
/// Panics if the `prev_pos` field is equal to [`PrevPos::Unknown`]. In practice, this means
/// the entry was obtained via calling something like [`super::HashMapAccess::entry_at_bucket`].
pub fn remove(mut self) -> V {
// If this bucket was queried by index, go ahead and follow its chain from the start.
let prev = if let PrevPos::Unknown(hash) = self.prev_pos {
@@ -108,16 +104,16 @@ impl<K: 'static, V: 'static> OccupiedEntry<'_, K, V> {
}
/// An abstract view into a vacant entry within the map.
pub struct VacantEntry<'b, K: 'static, V: 'static> {
pub struct VacantEntry<'a, 'b, K, V> {
/// Mutable reference to the map containing this entry.
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<K, V>>,
pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
/// The key to be inserted into this entry.
pub(crate) key: K,
/// The position within the dictionary corresponding to the key's hash.
pub(crate) dict_pos: u32,
}
impl<'b, K: Clone + Hash + Eq + 'static, V: 'static> VacantEntry<'b, K, V> {
impl<'b, K: Clone + Hash + Eq, V> VacantEntry<'_, 'b, K, V> {
/// Insert a value into the vacant entry, finding and populating an empty bucket in the process.
///
/// # Errors

View File

@@ -21,7 +21,6 @@ use nix::unistd::ftruncate as nix_ftruncate;
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
/// future.
#[derive(Debug)]
pub struct ShmemHandle {
/// memfd file descriptor
fd: OwnedFd,
@@ -36,7 +35,6 @@ pub struct ShmemHandle {
}
/// This is stored at the beginning in the shared memory area.
#[derive(Debug)]
struct SharedStruct {
max_size: usize,

View File

@@ -310,11 +310,6 @@ impl AtomicLsn {
}
}
/// Consumes the atomic and returns the contained value.
pub const fn into_inner(self) -> Lsn {
Lsn(self.inner.into_inner())
}
/// Atomically retrieve the `Lsn` value from memory.
pub fn load(&self) -> Lsn {
Lsn(self.inner.load(Ordering::Acquire))

View File

@@ -54,7 +54,6 @@ pageserver_api.workspace = true
pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
pageserver_compaction.workspace = true
pageserver_page_api.workspace = true
peekable.workspace = true
pem.workspace = true
pin-project-lite.workspace = true
postgres_backend.workspace = true
@@ -67,7 +66,6 @@ postgres-types.workspace = true
posthog_client_lite.workspace = true
pprof.workspace = true
pq_proto.workspace = true
prost.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true

View File

@@ -3,4 +3,3 @@ mod pool;
mod retry;
pub use client::{PageserverClient, ShardSpec};
pub use pageserver_api::shard::ShardStripeSize; // used in ShardSpec

View File

@@ -33,8 +33,6 @@ pub enum ProtocolError {
Invalid(&'static str, String),
#[error("required field '{0}' is missing")]
Missing(&'static str),
#[error("invalid combination of not_modified_lsn '{0}' and request_lsn '{1}'")]
InvalidLsns(Lsn, Lsn),
}
impl ProtocolError {
@@ -87,9 +85,9 @@ impl TryFrom<proto::ReadLsn> for ReadLsn {
return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
}
if pb.not_modified_since_lsn > pb.request_lsn {
return Err(ProtocolError::InvalidLsns(
Lsn(pb.not_modified_since_lsn),
Lsn(pb.request_lsn),
return Err(ProtocolError::invalid(
"not_modified_since_lsn",
pb.not_modified_since_lsn,
));
}
Ok(Self {

View File

@@ -25,9 +25,6 @@ tracing.workspace = true
tokio.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
axum.workspace = true
http.workspace = true
metrics.workspace = true
tonic.workspace = true
url.workspace = true

View File

@@ -34,10 +34,6 @@ use crate::util::{request_stats, tokio_thread_local_stats};
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "false")]
grpc: bool,
#[clap(long, default_value = "false")]
grpc_stream: bool,
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
/// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
@@ -82,9 +78,6 @@ pub(crate) struct Args {
#[clap(long)]
set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
#[clap(long)]
only_relnode: Option<u32>,
/// Queue depth generated in each client.
#[clap(long, default_value = "1")]
queue_depth: NonZeroUsize,
@@ -99,31 +92,10 @@ pub(crate) struct Args {
#[clap(long, default_value = "1")]
batch_size: NonZeroUsize,
#[clap(long)]
only_relnode: Option<u32>,
targets: Option<Vec<TenantTimelineId>>,
#[clap(long, default_value = "100")]
pool_max_consumers: NonZeroUsize,
#[clap(long, default_value = "5")]
pool_error_threshold: NonZeroUsize,
#[clap(long, default_value = "5000")]
pool_connect_timeout: NonZeroUsize,
#[clap(long, default_value = "1000")]
pool_connect_backoff: NonZeroUsize,
#[clap(long, default_value = "60000")]
pool_max_idle_duration: NonZeroUsize,
#[clap(long, default_value = "0")]
max_delay_ms: usize,
#[clap(long, default_value = "0")]
percent_drops: usize,
#[clap(long, default_value = "0")]
percent_hangs: usize,
}
/// State shared by all clients
@@ -180,6 +152,7 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
main_impl(args, thread_local_stats)
})
}
async fn main_impl(
args: Args,
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
@@ -344,7 +317,6 @@ async fn main_impl(
let rps_period = args
.per_client_rate
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
let ss = shared_state.clone();
let cancel = cancel.clone();

View File

@@ -453,7 +453,6 @@ impl TimelineHandles {
handles: Default::default(),
}
}
async fn get(
&mut self,
tenant_id: TenantId,

View File

@@ -5,12 +5,10 @@ MODULE_big = neon
OBJS = \
$(WIN32RES) \
communicator.o \
communicator_new.o \
communicator_process.o \
extension_server.o \
file_cache.o \
hll.o \
lfc_prewarm.o \
libpagestore.o \
logical_replication_monitor.o \
neon.o \
@@ -69,7 +67,6 @@ WALPROP_OBJS = \
# libcommunicator.a is built by cargo from the Rust sources under communicator/
# subdirectory. `cargo build` also generates communicator_bindings.h.
communicator_new.o: communicator/communicator_bindings.h
communicator_process.o: communicator/communicator_bindings.h
file_cache.o: communicator/communicator_bindings.h

View File

@@ -17,36 +17,14 @@ rest_broker = []
[dependencies]
axum.workspace = true
bytes.workspace = true
clashmap.workspace = true
http.workspace = true
libc.workspace = true
nix.workspace = true
atomic_enum = "0.3.0"
measured.workspace = true
prometheus.workspace = true
prost.workspace = true
strum_macros.workspace = true
thiserror.workspace = true
tonic = { workspace = true, default-features = false, features=["codegen", "prost", "transport"] }
tokio = { workspace = true, features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
tokio-pipe = { version = "0.2.12" }
tracing.workspace = true
tracing-subscriber.workspace = true
uring-common = { workspace = true, features = ["bytes"] }
pageserver_client_grpc.workspace = true
pageserver_api.workspace = true
pageserver_page_api.workspace = true
neon-shmem.workspace = true
measured.workspace = true
utils.workspace = true
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }
[target.'cfg(target_os = "linux")'.dependencies]
fiemap = "0.1.3"
itertools.workspace = true
[build-dependencies]
cbindgen.workspace = true

View File

@@ -3,18 +3,9 @@
This package provides the so-called "compute-pageserver communicator",
or just "communicator" in short. The communicator is a separate
background worker process that runs in the PostgreSQL server. It's
part of the neon extension.
The communicator handles the communication with the pageservers, and
also provides an HTTP endpoint for metrics over a local Unix Domain
socket (aka. the "communicator control socket"). On the PostgreSQL
side, the glue code in pgxn/neon/ uses the communicator to implement
the PostgreSQL Storage Manager (SMGR) interface.
## Design criteria
- Low latency
- Saturate a 10 Gbit/s network interface without becoming a bottleneck
part of the neon extension. Currently, it only provides an HTTP
endpoint for metrics, but in the future it will evolve to handle all
communications with the pageservers.
## Source code view
@@ -23,122 +14,10 @@ pgxn/neon/communicator_process.c
the glue that interacts with PostgreSQL code and the Rust
code in the communicator process.
pgxn/neon/communicator_new.c
Contains the backend code that interacts with the communicator
process.
pgxn/neon/communicator/src/backend_interface.rs
The entry point for calls from each backend.
pgxn/neon/communicator/src/init.rs
Initialization at server startup
pgxn/neon/communicator/src/worker_process/
Worker process main loop and glue code
At compilation time, pgxn/neon/communicator/ produces a static
library, libcommunicator.a. It is linked to the neon.so extension
library.
The real networking code, which is independent of PostgreSQL, is in
the pageserver/client_grpc crate.
## Process view
The communicator runs in a dedicated background worker process, the
"communicator process". The communicator uses a multi-threaded Tokio
runtime to execute the IO requests. So the communicator process has
multiple threads running. That's unusual for Postgres processes and
care must be taken to make that work.
### Backend <-> worker communication
Each backend has a number of I/O request slots in shared memory. The
slots are statically allocated for each backend, and must not be
accessed by other backends. The worker process reads requests from the
shared memory slots, and writes responses back to the slots.
Here's an example snapshot of the system, when two requests from two
different backends are in progress:
```
Backends Request slots Communicator process
--------- ------------- --------------------
Backend 1 1: Idle
2: Idle
3: Processing tokio task handling request 3
Backend 2 4: Completed
5: Processing tokio task handling request 5
6: Idle
... ...
```
To submit an IO request, the backend first picks one of its Idle
slots, writes the IO request in the slot, and updates it to
'Submitted' state. That transfers the ownership of the slot to the
worker process, until the worker process marks the request as
Completed. The worker process spawns a separate Tokio task for each
request.
To inform the worker process that a request slot has a pending IO
request, there's a pipe shared by the worker process and all backend
processes. The backend writes the index of the request slot to the
pipe after changing the slot's state to Submitted. This wakes up the
worker process.
(Note that the pipe is just used for wakeups, but the worker process
is free to pick up Submitted IO requests even without receiving the
wakeup. As of this writing, it doesn't do that, but it might be useful
in the future to reduce latency even further, for example.)
When the worker process has completed processing the request, it
writes the result back in the request slot. A GetPage request can also
contain a pointer to buffer in the shared buffer cache. In that case,
the worker process writes the resulting page contents directly to the
buffer, and just a result code in the request slot. It then updates
the 'state' field to Completed, which passes the ownership back to
the originating backend. Finally, it signals the process Latch of the
originating backend, waking it up.
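To make the protocol concrete, here is a minimal sketch of the backend side in terms of the `NeonIORequestSlot` API that appears later in this diff; the `wake_worker` and `wait_on_latch` closures are stand-ins for the submission-pipe write and the Postgres latch wait, which are not part of the slot itself:

```
// Hedged sketch of one request round trip, seen from the backend process.
fn submit_and_wait(
    slot: &NeonIORequestSlot,
    request: &NeonIORequest,
    my_proc_number: i32,
    wake_worker: impl Fn(),   // writes the slot index to the submission pipe
    wait_on_latch: impl Fn(), // blocks until the communicator sets our latch
) -> NeonIOResult {
    // Idle -> Filling -> Submitted; ownership moves to the communicator process.
    slot.submit_request(request, my_proc_number);
    wake_worker();
    loop {
        // Completed -> Idle; ownership (and the result) comes back to us.
        if let Some(result) = slot.try_get_result() {
            return result;
        }
        wait_on_latch();
    }
}
```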
### Differences between PostgreSQL v16, v17 and v18
PostgreSQL v18 introduced the new AIO mechanism. The PostgreSQL AIO
mechanism uses a scheme very similar to the one described in the previous
section for the communication between AIO worker processes and
backends. With our communicator, the AIO worker processes are not
used, but we use the same PgAioHandle request slots as in upstream.
For Neon-specific IO requests like GetDbSize, a neon request slot is
used. But for the actual IO requests, the request slot merely contains
a pointer to the PgAioHandle slot. The worker process updates the
status of that, calls the IO callbacks upon completion etc., just like
the upstream AIO worker processes do.
## Sequence diagram
PostgreSQL     neon extension      backend_interface.rs     worker_process.rs     processor     tonic
    |                .                       .                        .                .           .
    | smgr_read()    .                       .                        .                .           .
    +--------------> +                       .                        .                .           .
    .                | rcommunicator_        .                        .                .           .
    .                | get_page_at_lsn       .                        .                .           .
    .                +---------------------> +                        .                .           .
    .                .                       | write request to slot  .                .           .
    .                .                       |                        .                .           .
    .                .                       | submit_request()       .                .           .
    .                .                       +----------------------> +                .           .
    .                .                       .                        | db_size_request.           .
    .                .                       .                        +--------------> .           .
    .                .                       .                        .                .        TODO
### Compute <-> pageserver protocol
The protocol between Compute and the pageserver is based on gRPC. See `protos/`.

View File

@@ -1,224 +0,0 @@
//! This module implements a request/response "slot" for submitting
//! requests from backends to the communicator process.
//!
//! NB: The "backend" side of this code runs in Postgres backend processes,
//! which means that it is not safe to use the 'tracing' crate for logging, nor
//! to launch threads or use tokio tasks!
use std::cell::UnsafeCell;
use std::sync::atomic::{AtomicI32, Ordering};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use atomic_enum::atomic_enum;
/// One request/response slot. Each backend has its own set of slots that it
/// uses.
///
/// This is the moral equivalent of PgAioHandle for Postgres AIO requests
/// Like PgAioHandle, try to keep this small.
///
/// There is an array of these in shared memory. Therefore, this must be Sized.
///
/// ## Lifecycle of a request
///
/// A slot is always owned by either the backend process or the communicator
/// process, depending on the 'state'. Only the owning process is allowed to
/// read or modify the slot, except for reading the 'state' itself to check who
/// owns it.
///
/// A slot begins in the Idle state, where it is owned by the backend process.
/// To submit a request, the backend process fills the slot with the request
/// data, and changes it to the Submitted state. After changing the state, the
/// slot is owned by the communicator process, and the backend is not allowed
/// to access it until the communicator process marks it as Completed.
///
/// When the communicator process sees that the slot is in Submitted state, it
/// starts to process the request. After processing the request, it stores the
/// result in the slot, and changes the state to Completed. It is now owned by
/// the backend process again, which may now read the result, and reuse the
/// slot for a new request.
///
/// For correctness of the above protocol, we really only need two states:
/// "owned by backend" and "owned by communicator process". But to help with
/// debugging and better assertions, there are a few more states. When the
/// backend starts to fill in the request details in the slot, it first sets the
/// state from Idle to Filling, and when it's done with that, from Filling to
/// Submitted. In the Filling state, the slot is still owned by the
/// backend. Similarly, when the communicator process starts to process a
/// request, it sets it to Processing state first, but the slot is still owned
/// by the communicator process.
///
/// This struct doesn't handle waking up the communicator process when a request
/// has been submitted or when a response is ready. The 'owner_procno' is used
/// for waking up the backend on completion, but that happens elsewhere.
pub struct NeonIORequestSlot {
/// similar to PgAioHandleState
state: AtomicNeonIORequestSlotState,
/// The owning process's ProcNumber. The worker process uses this to set the
/// process's latch on completion.
///
/// (This could be calculated from num_neon_request_slots_per_backend and
/// the index of this slot in the overall 'neon_request_slots' array. But we
/// prefer the communicator process to not know how the request slots are
/// divided between the backends.)
owner_procno: AtomicI32,
/// SAFETY: This is modified by submit_request(), after it has established
/// ownership of the slot by setting state from Idle to Filling
request: UnsafeCell<NeonIORequest>,
/// Valid when state is Completed
///
/// SAFETY: This is modified by RequestProcessingGuard::complete(). There
/// can be only one RequestProcessingGuard outstanding for a slot at a time,
/// because it is returned by start_processing_request() which checks the
/// state, so RequestProcessingGuard has exclusive access to the slot.
result: UnsafeCell<NeonIOResult>,
}
// The protocol described in the "Lifecycle of a request" section above ensures
// the safe access to the fields
unsafe impl Send for NeonIORequestSlot {}
unsafe impl Sync for NeonIORequestSlot {}
impl Default for NeonIORequestSlot {
fn default() -> NeonIORequestSlot {
NeonIORequestSlot {
owner_procno: AtomicI32::new(-1),
request: UnsafeCell::new(NeonIORequest::Empty),
result: UnsafeCell::new(NeonIOResult::Empty),
state: AtomicNeonIORequestSlotState::new(NeonIORequestSlotState::Idle),
}
}
}
#[atomic_enum]
#[derive(Eq, PartialEq)]
pub enum NeonIORequestSlotState {
Idle,
/// Backend is filling in the request
Filling,
/// Backend has submitted the request to the communicator, but the
/// communicator process has not yet started processing it.
Submitted,
/// Communicator is processing the request
Processing,
/// Communicator has completed the request, and the 'result' field is now
/// valid, but the backend has not read the result yet.
Completed,
}
impl NeonIORequestSlot {
/// Write a request to the slot, and mark it as Submitted.
///
/// Note: This does not wake up the worker process to actually process
/// the request. It's the caller's responsibility to do that.
pub fn submit_request(&self, request: &NeonIORequest, proc_number: i32) {
// Verify that the slot is in Idle state previously, and put it in
// Filling state.
//
// XXX: This step isn't strictly necessary. Assuming the caller didn't
// screw up and try to use a slot that's already in use, we could fill
// the slot and switch it directly from Idle to Submitted state.
if let Err(s) = self.state.compare_exchange(
NeonIORequestSlotState::Idle,
NeonIORequestSlotState::Filling,
Ordering::Relaxed,
Ordering::Relaxed,
) {
panic!("unexpected state in request slot: {s:?}");
}
// Fill in the request details
self.owner_procno.store(proc_number, Ordering::Relaxed);
unsafe { *self.request.get() = *request }
// This synchronizes-with store/swap in [`start_processing_request`].
// Note that this ensures that the previous non-atomic writes are visible
// to other threads too.
self.state
.store(NeonIORequestSlotState::Submitted, Ordering::Release);
}
pub fn get_state(&self) -> NeonIORequestSlotState {
self.state.load(Ordering::Relaxed)
}
pub fn try_get_result(&self) -> Option<NeonIOResult> {
// This synchronizes-with the store/swap in [`RequestProcessingGuard::completed`]
let state = self.state.load(Ordering::Acquire);
if state == NeonIORequestSlotState::Completed {
let result = unsafe { *self.result.get() };
self.state
.store(NeonIORequestSlotState::Idle, Ordering::Relaxed);
Some(result)
} else {
None
}
}
/// Read the IO request from the slot indicated in the wakeup
pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
// XXX: using atomic load rather than compare_exchange would be
// sufficient here, as long as the communicator process has _some_ means
// of tracking which requests it's already processing. That could be a
// flag somewhere in communicator's private memory, for example.
//
// This synchronizes-with the store in [`submit_request`].
if let Err(s) = self.state.compare_exchange(
NeonIORequestSlotState::Submitted,
NeonIORequestSlotState::Processing,
Ordering::Acquire,
Ordering::Relaxed,
) {
// FIXME surprising state. This is unexpected at the moment, but if we
// started to process requests more aggressively, without waiting for the
// read from the pipe, then this could happen
panic!("unexpected state in request slot: {s:?}");
}
Some(RequestProcessingGuard(self))
}
}
/// [`NeonIORequestSlot::start_processing_request`] returns this guard object to
/// indicate that the caller now "owns" the slot, until it calls
/// [`RequestProcessingGuard::completed`].
///
/// TODO: implement Drop on this, to mark the request as Aborted or Errored
/// if [`RequestProcessingGuard::completed`] is not called.
pub struct RequestProcessingGuard<'a>(&'a NeonIORequestSlot);
unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
impl<'a> RequestProcessingGuard<'a> {
pub fn get_request(&self) -> &NeonIORequest {
unsafe { &*self.0.request.get() }
}
pub fn get_owner_procno(&self) -> i32 {
self.0.owner_procno.load(Ordering::Relaxed)
}
pub fn completed(self, result: NeonIOResult) {
// Store the result to the slot.
unsafe {
*self.0.result.get() = result;
};
// Mark the request as completed. After that, we no longer have
// ownership of the slot, and must not modify it.
let old_state = self
.0
.state
.swap(NeonIORequestSlotState::Completed, Ordering::Release);
assert!(old_state == NeonIORequestSlotState::Processing);
}
}
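For orientation, a hedged sketch of how the communicator process might drive this API for a single wakeup; `process_request` is a hypothetical stand-in for the real processing code, and `notify_proc` refers to the callback wrapper shown later in this diff:

```
// Hedged sketch of the worker side of the slot protocol.
async fn handle_wakeup(slot: &NeonIORequestSlot) {
    // Submitted -> Processing; the guard gives us exclusive access to the slot.
    let Some(guard) = slot.start_processing_request() else {
        return;
    };
    let request = *guard.get_request(); // NeonIORequest is Copy
    let owner_procno = guard.get_owner_procno();

    // Hypothetical helper: performs the actual I/O, e.g. against the pageserver.
    let result: NeonIOResult = process_request(&request).await;

    // Processing -> Completed; ownership returns to the backend...
    guard.completed(result);
    // ...which we then wake up by setting its process latch
    // (wrapper from worker_process/callbacks.rs).
    notify_proc(owner_procno);
}

// Hypothetical stand-in for the real request processing.
async fn process_request(_request: &NeonIORequest) -> NeonIOResult {
    NeonIOResult::Empty
}
```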

View File

@@ -1,296 +0,0 @@
//! This code runs in each backend process. That means that launching Rust threads, panicking
//! etc. is forbidden!
use std::os::fd::OwnedFd;
use crate::backend_comms::NeonIORequestSlot;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
use crate::neon_request::{CCachedGetPageVResult, CLsn, COid};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use utils::lsn::Lsn;
pub struct CommunicatorBackendStruct<'t> {
my_proc_number: i32,
neon_request_slots: &'t [NeonIORequestSlot],
submission_pipe_write_fd: OwnedFd,
pending_cache_read_op: Option<BackendCacheReadOp<'t>>,
integrated_cache: &'t IntegratedCacheReadAccess<'t>,
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_backend_init(
cis: Box<CommunicatorInitStruct>,
my_proc_number: i32,
) -> &'static mut CommunicatorBackendStruct<'static> {
if my_proc_number < 0 {
panic!("cannot attach to communicator shared memory with procnumber {my_proc_number}");
}
let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init()));
let bs: &'static mut CommunicatorBackendStruct =
Box::leak(Box::new(CommunicatorBackendStruct {
my_proc_number,
neon_request_slots: cis.neon_request_slots,
submission_pipe_write_fd: cis.submission_pipe_write_fd,
pending_cache_read_op: None,
integrated_cache,
}));
bs
}
/// Start a request. You can poll for its completion and get the result by
/// calling bcomm_poll_request_completion(). The communicator will wake
/// us up by setting our process latch, so to wait for the completion, wait on
/// the latch and call bcomm_poll_request_completion() every time the
/// latch is set.
///
/// Safety: The C caller must ensure that the references are valid.
/// The requested slot must be free, or this panics.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_io_request(
bs: &'_ mut CommunicatorBackendStruct,
slot_idx: i32,
request: &NeonIORequest,
immediate_result_ptr: &mut NeonIOResult,
) -> i32 {
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
if let NeonIORequest::RelSize(req) = request {
if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) {
*immediate_result_ptr = NeonIOResult::RelSize(nblocks);
return -1;
}
}
// Create neon request and submit it
bs.start_neon_io_request(slot_idx, request);
slot_idx
}
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_get_page_v_request(
bs: &mut CommunicatorBackendStruct,
slot_idx: i32,
request: &NeonIORequest,
immediate_result_ptr: &mut CCachedGetPageVResult,
) -> i32 {
let NeonIORequest::GetPageV(get_pagev_request) = request else {
panic!("invalid request passed to bcomm_start_get_page_v_request()");
};
assert!(matches!(request, NeonIORequest::GetPageV(_)));
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
let mut all_cached = true;
let mut read_op = bs.integrated_cache.start_read_op();
for i in 0..get_pagev_request.nblocks {
if let Some(cache_block) = read_op.get_page(
&get_pagev_request.reltag(),
get_pagev_request.block_number + i as u32,
) {
immediate_result_ptr.cache_block_numbers[i as usize] = cache_block;
} else {
// not found in cache
all_cached = false;
break;
}
}
if all_cached {
bs.pending_cache_read_op = Some(read_op);
return -1;
}
// Create neon request and submit it
bs.start_neon_io_request(slot_idx, request);
slot_idx
}
/// Check if a request has completed. Returns:
///
/// -1 if the request is still being processed
/// 0 on success
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_poll_request_completion(
bs: &mut CommunicatorBackendStruct,
request_slot_idx: u32,
result_p: &mut NeonIOResult,
) -> i32 {
match bs.neon_request_slots[request_slot_idx as usize].try_get_result() {
None => -1, // still processing
Some(result) => {
*result_p = result;
0
}
}
}
/// Check the status of a request slot. Returns:
///
/// 'false' if the slot is Idle. The backend process has ownership.
/// 'true' if the slot is busy, and should be polled for result.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_get_request_slot_status(
bs: &mut CommunicatorBackendStruct,
request_slot_idx: u32,
) -> bool {
use crate::backend_comms::NeonIORequestSlotState;
match bs.neon_request_slots[request_slot_idx as usize].get_state() {
NeonIORequestSlotState::Idle => false,
NeonIORequestSlotState::Filling => {
// 'false' would be the right result here. However, this
// is a very transient state. The C code should never
// leave a slot in this state, so if it sees that,
// something's gone wrong and it's not clear what to do
// with it.
panic!("unexpected Filling state in request slot {request_slot_idx}");
}
NeonIORequestSlotState::Submitted => true,
NeonIORequestSlotState::Processing => true,
NeonIORequestSlotState::Completed => true,
}
}
// LFC functions
/// Finish a local file cache read
///
//
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool {
if let Some(op) = bs.pending_cache_read_op.take() {
op.finish()
} else {
panic!("bcomm_finish_cache_read() called with no cached read pending");
}
}
/// Check if LFC contains the given buffer, and update its last-written LSN if not.
///
/// This is used in WAL replay in read replica, to skip updating pages that are
/// not in cache.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_update_lw_lsn_for_block_if_not_cached(
bs: &mut CommunicatorBackendStruct,
spc_oid: COid,
db_oid: COid,
rel_number: u32,
fork_number: u8,
block_number: u32,
lsn: CLsn,
) -> bool {
bs.integrated_cache.update_lw_lsn_for_block_if_not_cached(
&pageserver_page_api::RelTag {
spcnode: spc_oid,
dbnode: db_oid,
relnode: rel_number,
forknum: fork_number,
},
block_number,
Lsn(lsn),
)
}
#[repr(C)]
#[derive(Clone, Debug)]
pub struct FileCacheIterator {
next_bucket: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
}
/// Iterate over LFC contents
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_iterate_begin(
_bs: &mut CommunicatorBackendStruct,
iter: *mut FileCacheIterator,
) {
unsafe { (*iter).next_bucket = 0 };
}
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_iterate_next(
bs: &mut CommunicatorBackendStruct,
iter: *mut FileCacheIterator,
) -> bool {
use crate::integrated_cache::GetBucketResult;
loop {
let next_bucket = unsafe { (*iter).next_bucket } as usize;
match bs.integrated_cache.get_bucket(next_bucket) {
GetBucketResult::Occupied(rel, blk) => {
unsafe {
(*iter).spc_oid = rel.spcnode;
(*iter).db_oid = rel.dbnode;
(*iter).rel_number = rel.relnode;
(*iter).fork_number = rel.forknum;
(*iter).block_number = blk;
(*iter).next_bucket += 1;
}
break true;
}
GetBucketResult::Vacant => {
unsafe {
(*iter).next_bucket += 1;
}
continue;
}
GetBucketResult::OutOfBounds => {
break false;
}
}
}
}
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_get_num_pages_used(bs: &mut CommunicatorBackendStruct) -> u64 {
bs.integrated_cache.get_num_buckets_in_use() as u64
}
impl<'t> CommunicatorBackendStruct<'t> {
/// The slot must be free, or this panics.
pub(crate) fn start_neon_io_request(&mut self, request_slot_idx: i32, request: &NeonIORequest) {
let my_proc_number = self.my_proc_number;
self.neon_request_slots[request_slot_idx as usize].submit_request(request, my_proc_number);
// Tell the communicator about it
self.notify_about_request(request_slot_idx);
}
/// Send a wakeup to the communicator process
fn notify_about_request(self: &CommunicatorBackendStruct<'t>, request_slot_idx: i32) {
// wake up communicator by writing the idx to the submission pipe
//
// This can block, if the pipe is full. That should be very rare,
// because the communicator tries hard to drain the pipe to prevent
// that. Also, there's a natural upper bound on how many wakeups can be
// queued up: there is only a limited number of request slots for each
// backend.
//
// If it does block very briefly, that's not too serious.
let idxbuf = request_slot_idx.to_ne_bytes();
let _res = nix::unistd::write(&self.submission_pipe_write_fd, &idxbuf);
// FIXME: check result, return any errors
}
}

View File

@@ -1,299 +0,0 @@
//! Implement the "low-level" parts of the file cache.
//!
//! This module just deals with reading and writing the file, and keeping track
//! which blocks in the cache file are in use and which are free. The "high
//! level" parts of tracking which block in the cache file corresponds to which
//! relation block is handled in 'integrated_cache' instead.
//!
//! This module is only used to access the file from the communicator
//! process. The backend processes *also* read the file (and sometimes also
//! write it?), but the backends use direct C library calls for that.
use std::fs::File;
use std::os::unix::fs::FileExt;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::atomic::{AtomicU64, Ordering};
use measured::metric;
use measured::metric::MetricEncoding;
use measured::metric::gauge::GaugeState;
use measured::{Gauge, MetricGroup};
use crate::BLCKSZ;
use tokio::task::spawn_blocking;
pub type CacheBlock = u64;
pub const INVALID_CACHE_BLOCK: CacheBlock = u64::MAX;
pub struct FileCache {
file: Arc<File>,
free_list: Mutex<FreeList>,
/// The true number of writable blocks within the LFC file.
///
/// The `max_blocks` field of `FreeList` isn't sufficient once holes are
/// punched, so we track this manually.
pub size: AtomicU64,
/// The path to the LFC file.
///
/// The `fiemap-rs` library doesn't expose any way to issue a FIEMAP ioctl
/// on an existing file descriptor, so we have to save the path.
path: PathBuf,
metrics: FileCacheMetricGroup,
}
#[derive(MetricGroup)]
#[metric(new())]
struct FileCacheMetricGroup {
/// Local File Cache size in 8KiB blocks
max_blocks: Gauge,
/// Number of free 8KiB blocks in Local File Cache
num_free_blocks: Gauge,
}
// TODO: We keep track of all free blocks in this vec. That doesn't really scale.
// Idea: when free_blocks fills up with more than 1024 entries, write them all to
// one block on disk.
#[derive(Debug)]
struct FreeList {
next_free_block: CacheBlock,
max_blocks: u64,
free_blocks: Vec<CacheBlock>,
}
impl FileCache {
pub fn new(file_cache_path: &Path, mut initial_size: u64) -> Result<FileCache, std::io::Error> {
if initial_size < 100 {
tracing::warn!(
"min size for file cache is 100 blocks, {} requested",
initial_size
);
initial_size = 100;
}
let file = std::fs::OpenOptions::new()
.read(true)
.write(true)
.truncate(true)
.create(true)
.open(file_cache_path)?;
Ok(FileCache {
file: Arc::new(file),
size: initial_size.into(),
free_list: Mutex::new(FreeList {
next_free_block: 0,
max_blocks: initial_size,
free_blocks: Vec::new(),
}),
path: file_cache_path.to_path_buf(),
metrics: FileCacheMetricGroup::new(),
})
}
pub async fn read_block(
&self,
cache_block: CacheBlock,
mut dst: impl uring_common::buf::IoBufMut + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(dst.bytes_total() == BLCKSZ);
let file = self.file.clone();
let dst_ref = unsafe { std::slice::from_raw_parts_mut(dst.stable_mut_ptr(), BLCKSZ) };
spawn_blocking(move || file.read_exact_at(dst_ref, cache_block * BLCKSZ as u64)).await??;
Ok(())
}
pub async fn write_block(
&self,
cache_block: CacheBlock,
src: impl uring_common::buf::IoBuf + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(src.bytes_init() == BLCKSZ);
let file = self.file.clone();
let src_ref = unsafe { std::slice::from_raw_parts(src.stable_ptr(), BLCKSZ) };
spawn_blocking(move || file.write_all_at(src_ref, cache_block * BLCKSZ as u64)).await??;
Ok(())
}
/// Allocate a block within the LFC file for use by the LFC.
pub fn alloc_block(&self) -> Option<CacheBlock> {
let mut free_list = self.free_list.lock().unwrap();
if let Some(x) = free_list.free_blocks.pop() {
return Some(x);
}
if free_list.next_free_block < free_list.max_blocks {
let result = free_list.next_free_block;
free_list.next_free_block += 1;
return Some(result);
}
None
}
/// Mark a block used by the LFC as free for allocation.
pub fn dealloc_block(&self, cache_block: CacheBlock) {
let mut free_list = self.free_list.lock().unwrap();
free_list.free_blocks.push(cache_block);
}
/// Attempt to let the filesystem reclaim `num_blocks` of free blocks within the LFC.
/// Returns the number of blocks not reclaimed by the filesystem.
pub fn reclaim_blocks(&self, num_blocks: u64) -> u64 {
let mut free_list = self.free_list.lock().unwrap();
// Try to limit the maximum first so that we can shrink without doing any I/O
let unused_space = num_blocks.min(free_list.max_blocks - free_list.next_free_block);
free_list.max_blocks -= unused_space;
self.size.fetch_sub(unused_space, Ordering::Relaxed);
let punched = (free_list.free_blocks.len() as u64).min(num_blocks - unused_space);
for _ in 0..punched {
self.punch_block(free_list.free_blocks.pop().unwrap());
}
num_blocks - unused_space - punched
}
// "Delete" a block via fallocate's hole punching feature.
// TODO(quantumish): possibly implement some batching? lots of syscalls...
// unfortunately batching is likely at odds with our access pattern, as entries in the hashmap
// should have no correlation with the location of blocks in the actual LFC file.
/// "Un-punch" a block by re-allocating it with `fallocate` and update LFC size.
fn unpunch_block(&self, block: CacheBlock) {
use nix::fcntl as nix;
self.size.fetch_add(1, Ordering::Relaxed);
if let Err(e) = nix::fallocate(
self.file.clone(),
nix::FallocateFlags::FALLOC_FL_ZERO_RANGE
.union(nix::FallocateFlags::FALLOC_FL_KEEP_SIZE),
(block as usize * BLCKSZ) as libc::off_t,
BLCKSZ as libc::off_t
) {
tracing::error!("failed to un-punch hole in LFC at {block}: {e}");
return;
}
}
/// "Punch" a block out of the LFC file by using `fallocate` and update LFC size.
pub fn punch_block(&self, block: CacheBlock) {
use nix::fcntl as nix;
self.size.fetch_sub(1, Ordering::Relaxed);
if let Err(e) = nix::fallocate(
self.file.clone(),
nix::FallocateFlags::FALLOC_FL_PUNCH_HOLE
.union(nix::FallocateFlags::FALLOC_FL_KEEP_SIZE),
(block as usize * BLCKSZ) as libc::off_t,
BLCKSZ as libc::off_t
) {
tracing::error!("failed to punch hole in LFC at {block}: {e}");
return;
}
}
/// Attempt to unpunch `num_blocks` of previously hole-punched blocks.
///
/// Similarly named to `unpunch_block` but does not punch a series of blocks in a row.
/// Instead uses FIEMAP ioctl to locate holes in the file and unpunch them!
#[cfg(target_os = "linux")]
pub fn unpunch_blocks(&self, num_blocks: u64) -> u64 {
use itertools::Itertools;
let mut pushed = 0;
let mut free_list = self.free_list.lock().unwrap();
let res = fiemap::fiemap(self.path.as_path()).unwrap();
for (prev, cur) in res.map(|x| x.unwrap()).tuple_windows() {
if (prev.fe_logical + prev.fe_length) < cur.fe_logical {
let mut end = prev.fe_logical + prev.fe_length;
while end < cur.fe_logical {
free_list.free_blocks.push(end / BLCKSZ as u64);
self.unpunch_block(end / BLCKSZ as u64);
pushed += 1;
if pushed == num_blocks {
return 0;
}
end += BLCKSZ as u64;
}
}
}
num_blocks - pushed
}
/// Attempt to unpunch `num_blocks` of previously hole-punched blocks.
///
/// Much more expensive than the Linux variant as each hole must be located
/// by a separate call to `lseek`.
///
/// FIXME: Sometimes this function provides inaccurate counts of the number
/// of holes within a file. Whether we need this function at all is unclear,
/// as seemingly this part of the codebase only targets a system with ext4?
#[cfg(target_os = "macos")]
pub fn unpunch_blocks(&self, num_blocks: u64) -> u64 {
use nix::unistd as nix;
let mut free_list = self.free_list.lock().unwrap();
let num_bytes = (free_list.next_free_block * BLCKSZ as u64) as i64;
let mut cur_pos = 0;
let mut pushed = 0;
while cur_pos < num_bytes {
let res = nix::lseek(
self.file.clone(),
cur_pos,
nix::Whence::SeekHole
).unwrap();
if res >= num_bytes {
break;
}
let block = (res / BLCKSZ as i64) as u64;
self.unpunch_block(block);
free_list.free_blocks.push(block);
pushed += 1;
if pushed == num_blocks {
return 0;
}
cur_pos = res + BLCKSZ as i64;
}
num_blocks - pushed
}
/// Physically grows the file and expands the freelist.
pub fn grow(&self, num_blocks: u64) {
let mut free_list = self.free_list.lock().unwrap();
self.size.fetch_add(num_blocks, Ordering::Relaxed);
free_list.max_blocks += num_blocks;
}
/// Returns number of blocks in the remaining space.
pub fn free_space(&self) -> u64 {
let free_list = self.free_list.lock().unwrap();
let slab = free_list.max_blocks - free_list.next_free_block.min(free_list.max_blocks);
let fragments = free_list.free_blocks.len() as u64;
slab + fragments
}
}
impl<T: metric::group::Encoding> MetricGroup<T> for FileCache
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), <T as metric::group::Encoding>::Err> {
// Update the gauges with fresh values first
{
let free_list = self.free_list.lock().unwrap();
self.metrics.max_blocks.set(free_list.max_blocks as i64);
let total_free_blocks: i64 = free_list.free_blocks.len() as i64
+ (free_list.max_blocks as i64 - free_list.next_free_block as i64);
self.metrics.num_free_blocks.set(total_free_blocks);
}
self.metrics.collect_group_into(enc)
}
}
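A hedged usage sketch of this file-level API (the path, size, and shared-memory pointer are made up; in the real code the integrated cache drives these calls):

```
// Hedged sketch: allocate a cache block, write an 8 KiB page into it, read it
// back, and return the block to the free list. `page_ptr` must point at a
// page-sized buffer in shared memory.
async fn cache_one_page(page_ptr: *mut u8) -> Result<(), std::io::Error> {
    let cache = FileCache::new(std::path::Path::new("neon.file_cache"), 1024)?;
    if let Some(block) = cache.alloc_block() {
        let buf = crate::neon_request::ShmemBuf { ptr: page_ptr };
        cache.write_block(block, buf).await?;
        cache.read_block(block, buf).await?;
        cache.dealloc_block(block);
    }
    Ok(())
}
```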

View File

@@ -1,107 +0,0 @@
//! Global allocator, for tracking memory usage of the Rust parts
//!
//! Postgres is designed to handle allocation failure (i.e. malloc() returning NULL) gracefully. It
//! rolls back the transaction and gives the user an "ERROR: out of memory" error. Rust code
//! however panics if an allocation fails. We don't want that to ever happen, because an unhandled
//! panic leads to Postgres crash and restart. Our strategy is to pre-allocate a large enough chunk
//! of memory for use by the Rust code, so that the allocations never fail.
//!
//! To pick the size for the pre-allocated chunk, we have a metric to track the high watermark
//! memory usage of all the Rust allocations in total.
//!
//! TODO:
//!
//! - Currently we just export the metrics. Actual allocations are still just passed through to
//! the system allocator.
//! - Take padding etc. overhead into account
use std::alloc::{GlobalAlloc, Layout, System};
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use measured::metric;
use measured::metric::MetricEncoding;
use measured::metric::gauge::GaugeState;
use measured::{Gauge, MetricGroup};
pub(crate) struct MyAllocator {
allocations: AtomicU64,
deallocations: AtomicU64,
allocated: AtomicUsize,
high: AtomicUsize,
}
#[derive(MetricGroup)]
#[metric(new())]
struct MyAllocatorMetricGroup {
/// Number of allocations in Rust code
communicator_mem_allocations: Gauge,
/// Number of deallocations in Rust code
communicator_mem_deallocations: Gauge,
/// Bytes currently allocated
communicator_mem_allocated: Gauge,
/// High watermark of allocated bytes
communicator_mem_high: Gauge,
}
unsafe impl GlobalAlloc for MyAllocator {
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
self.allocations.fetch_add(1, Ordering::Relaxed);
let mut allocated = self.allocated.fetch_add(layout.size(), Ordering::Relaxed);
allocated += layout.size();
self.high.fetch_max(allocated, Ordering::Relaxed);
unsafe { System.alloc(layout) }
}
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
self.deallocations.fetch_add(1, Ordering::Relaxed);
self.allocated.fetch_sub(layout.size(), Ordering::Relaxed);
unsafe { System.dealloc(ptr, layout) }
}
}
#[global_allocator]
static GLOBAL: MyAllocator = MyAllocator {
allocations: AtomicU64::new(0),
deallocations: AtomicU64::new(0),
allocated: AtomicUsize::new(0),
high: AtomicUsize::new(0),
};
pub(crate) struct MyAllocatorCollector {
metrics: MyAllocatorMetricGroup,
}
impl MyAllocatorCollector {
pub(crate) fn new() -> Self {
Self {
metrics: MyAllocatorMetricGroup::new(),
}
}
}
impl<T: metric::group::Encoding> MetricGroup<T> for MyAllocatorCollector
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), <T as metric::group::Encoding>::Err> {
// Update the gauges with fresh values first
self.metrics
.communicator_mem_allocations
.set(GLOBAL.allocations.load(Ordering::Relaxed) as i64);
self.metrics
.communicator_mem_deallocations
.set(GLOBAL.deallocations.load(Ordering::Relaxed) as i64);
self.metrics
.communicator_mem_allocated
.set(GLOBAL.allocated.load(Ordering::Relaxed) as i64);
self.metrics
.communicator_mem_high
.set(GLOBAL.high.load(Ordering::Relaxed) as i64);
self.metrics.collect_group_into(enc)
}
}

View File

@@ -1,166 +0,0 @@
//! Initialization functions. These are executed in the postmaster process,
//! at different stages of server startup.
//!
//!
//! Communicator initialization steps:
//!
//! 1. At postmaster startup, before shared memory is allocated,
//! rcommunicator_shmem_size() is called to get the amount of
//! shared memory that this module needs.
//!
//! 2. Later, after the shared memory has been allocated,
//! rcommunicator_shmem_init() is called to initialize the shmem
//! area.
//!
//! Per process initialization:
//!
//! When a backend process starts up, it calls rcommunicator_backend_init().
//! In the communicator worker process, other functions are called, see
//! `worker_process` module.
use std::ffi::c_int;
use std::mem;
use std::mem::MaybeUninit;
use std::os::fd::OwnedFd;
use crate::backend_comms::NeonIORequestSlot;
use crate::integrated_cache::IntegratedCacheInitStruct;
/// This struct is created in the postmaster process, and inherited to
/// the communicator process and all backend processes through fork()
#[repr(C)]
pub struct CommunicatorInitStruct {
pub submission_pipe_read_fd: OwnedFd,
pub submission_pipe_write_fd: OwnedFd,
// Shared memory data structures
pub num_neon_request_slots: u32,
pub neon_request_slots: &'static [NeonIORequestSlot],
pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>,
}
impl std::fmt::Debug for CommunicatorInitStruct {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
fmt.debug_struct("CommunicatorInitStruct")
.field("submission_pipe_read_fd", &self.submission_pipe_read_fd)
.field("submission_pipe_write_fd", &self.submission_pipe_write_fd)
.field("num_neon_request_slots", &self.num_neon_request_slots)
.field("neon_request_slots length", &self.neon_request_slots.len())
.finish()
}
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_size(num_neon_request_slots: u32) -> u64 {
let mut size = 0;
size += mem::size_of::<NeonIORequestSlot>() * num_neon_request_slots as usize;
// For integrated_cache's Allocator. TODO: make this adjustable
size += IntegratedCacheInitStruct::shmem_size();
size as u64
}
/// Initialize the shared memory segment. Returns a backend-private
/// struct, which will be inherited by backend processes through fork
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_init(
submission_pipe_read_fd: c_int,
submission_pipe_write_fd: c_int,
num_neon_request_slots: u32,
shmem_area_ptr: *mut MaybeUninit<u8>,
shmem_area_len: u64,
initial_file_cache_size: u64,
max_file_cache_size: u64,
) -> &'static mut CommunicatorInitStruct {
let shmem_area: &'static mut [MaybeUninit<u8>] =
unsafe { std::slice::from_raw_parts_mut(shmem_area_ptr, shmem_area_len as usize) };
let (neon_request_slots, remaining_area) =
alloc_array_from_slice::<NeonIORequestSlot>(shmem_area, num_neon_request_slots as usize);
for slot in neon_request_slots.iter_mut() {
slot.write(NeonIORequestSlot::default());
}
// 'neon_request_slots' is initialized now. (MaybeUninit::slice_assume_init_mut() is nightly-only
// as of this writing.)
let neon_request_slots = unsafe {
std::mem::transmute::<&mut [MaybeUninit<NeonIORequestSlot>], &mut [NeonIORequestSlot]>(
neon_request_slots,
)
};
// Give the rest of the area to the integrated cache
let integrated_cache_init_struct = IntegratedCacheInitStruct::shmem_init(
remaining_area,
initial_file_cache_size,
max_file_cache_size,
);
let (submission_pipe_read_fd, submission_pipe_write_fd) = unsafe {
use std::os::fd::FromRawFd;
(
OwnedFd::from_raw_fd(submission_pipe_read_fd),
OwnedFd::from_raw_fd(submission_pipe_write_fd),
)
};
let cis: &'static mut CommunicatorInitStruct = Box::leak(Box::new(CommunicatorInitStruct {
submission_pipe_read_fd,
submission_pipe_write_fd,
num_neon_request_slots,
neon_request_slots,
integrated_cache_init_struct,
}));
cis
}
pub fn alloc_from_slice<T>(
area: &mut [MaybeUninit<u8>],
) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
let layout = std::alloc::Layout::new::<T>();
let area_start = area.as_mut_ptr();
// pad to satisfy alignment requirements
let padding = area_start.align_offset(layout.align());
if padding + layout.size() > area.len() {
panic!("out of memory");
}
let area = &mut area[padding..];
let (result_area, remain) = area.split_at_mut(layout.size());
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
let result = unsafe { result_ptr.as_mut().unwrap() };
(result, remain)
}
pub fn alloc_array_from_slice<T>(
area: &mut [MaybeUninit<u8>],
len: usize,
) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
let layout = std::alloc::Layout::new::<T>();
let area_start = area.as_mut_ptr();
// pad to satisfy alignment requirements
let padding = area_start.align_offset(layout.align());
if padding + layout.size() * len > area.len() {
panic!("out of memory");
}
let area = &mut area[padding..];
let (result_area, remain) = area.split_at_mut(layout.size() * len);
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
let result = unsafe { std::slice::from_raw_parts_mut(result_ptr.as_mut().unwrap(), len) };
(result, remain)
}
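A hedged sketch of how these helpers are meant to be used (the `Header` struct and the slot count are made up for illustration):

```
// Hedged sketch: carve a single struct and an array of request slots out of a
// raw shared-memory area, then initialize them in place.
struct Header {
    magic: u32,
}

fn carve_shmem(area: &mut [std::mem::MaybeUninit<u8>]) {
    let (header, rest) = alloc_from_slice::<Header>(area);
    header.write(Header { magic: 0x4e45_4f4e });

    let (slots, _rest) = alloc_array_from_slice::<NeonIORequestSlot>(rest, 16);
    for slot in slots.iter_mut() {
        slot.write(NeonIORequestSlot::default());
    }
}
```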

File diff suppressed because it is too large Load Diff

View File

@@ -1,29 +1,5 @@
//! Three main parts:
//! - async tokio communicator core, which receives requests and processes them.
//! - Main loop and request queues, which route requests from backends to the core
//! - the per-backend glue code, which submits requests
mod backend_comms;
// mark this 'pub', because these functions are called from C code. Otherwise, the compiler
// complains about a bunch of structs and enum variants being unused, because it thinks
// the functions that use them are never called. There are some C-callable functions in
// other modules too, but marking this as pub is currently enough to silence the warnings
//
// TODO: perhaps collect *all* the extern "C" functions to one module?
pub mod backend_interface;
mod file_cache;
mod init;
mod integrated_cache;
mod neon_request;
mod worker_process;
mod global_allocator;
/// Name of the Unix Domain Socket that serves the metrics, and other APIs in the
/// future. This is within the Postgres data directory.
const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket";
// FIXME: get this from postgres headers somehow
pub const BLCKSZ: usize = 8192;

View File

@@ -1,466 +0,0 @@
// Definitions of some core PostgreSQL datatypes.
/// XLogRecPtr is defined in "access/xlogdefs.h" as:
///
/// ```
/// typedef uint64 XLogRecPtr;
/// ```
/// cbindgen:no-export
pub type XLogRecPtr = u64;
pub type CLsn = XLogRecPtr;
pub type COid = u32;
// This conveniently matches PG_IOV_MAX
pub const MAX_GETPAGEV_PAGES: usize = 32;
pub const INVALID_BLOCK_NUMBER: u32 = u32::MAX;
use std::ffi::CStr;
use pageserver_page_api::{self as page_api, SlruKind};
/// Request from a Postgres backend to the communicator process
#[allow(clippy::large_enum_variant)]
#[repr(C)]
#[derive(Copy, Clone, Debug, strum_macros::EnumDiscriminants)]
#[strum_discriminants(derive(measured::FixedCardinalityLabel))]
pub enum NeonIORequest {
Empty,
// Read requests. These are C-friendly variants of the corresponding structs in
// pageserver_page_api.
RelSize(CRelSizeRequest),
GetPageV(CGetPageVRequest),
ReadSlruSegment(CReadSlruSegmentRequest),
PrefetchV(CPrefetchVRequest),
DbSize(CDbSizeRequest),
/// This is like GetPageV, but bypasses the LFC and allows specifying the
/// request LSNs directly. For debugging purposes only.
GetPageVUncached(CGetPageVUncachedRequest),
// Write requests. These are needed to keep the relation size cache and LFC up-to-date.
// They are not sent to the pageserver.
WritePage(CWritePageRequest),
RelExtend(CRelExtendRequest),
RelZeroExtend(CRelZeroExtendRequest),
RelCreate(CRelCreateRequest),
RelTruncate(CRelTruncateRequest),
RelUnlink(CRelUnlinkRequest),
// Other requests
UpdateCachedRelSize(CUpdateCachedRelSizeRequest),
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub enum NeonIOResult {
Empty,
/// InvalidBlockNumber == 0xffffffff means "rel does not exist"
RelSize(u32),
/// the result pages are written to the shared memory addresses given in the request
GetPageV,
/// The result is written to the file, path to which is provided
/// in the request. The [`u64`] value here is the number of blocks.
ReadSlruSegment(u64),
/// A prefetch request returns as soon as the request has been received by the communicator.
/// It is processed in the background.
PrefetchVLaunched,
DbSize(u64),
// FIXME design compact error codes. Can't easily pass a string or other dynamic data.
// currently, this is 'errno'
Error(i32),
Aborted,
/// used for all write requests
WriteOK,
}
impl NeonIORequest {
/// All requests include a unique request ID, which can be used to trace the execution
/// of a request all the way to the pageservers. The request ID needs to be unique
/// within the lifetime of the Postgres instance (but not across servers or across
/// restarts of the same server).
pub fn request_id(&self) -> u64 {
use NeonIORequest::*;
match self {
Empty => 0,
RelSize(req) => req.request_id,
GetPageV(req) => req.request_id,
GetPageVUncached(req) => req.request_id,
ReadSlruSegment(req) => req.request_id,
PrefetchV(req) => req.request_id,
DbSize(req) => req.request_id,
WritePage(req) => req.request_id,
RelExtend(req) => req.request_id,
RelZeroExtend(req) => req.request_id,
RelCreate(req) => req.request_id,
RelTruncate(req) => req.request_id,
RelUnlink(req) => req.request_id,
UpdateCachedRelSize(req) => req.request_id,
}
}
}
/// Special quick result to a CGetPageVRequest request, indicating that the
/// requested pages are present in the local file cache. The backend can
/// read the blocks directly from the given LFC blocks.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CCachedGetPageVResult {
pub cache_block_numbers: [u64; MAX_GETPAGEV_PAGES],
}
/// ShmemBuf represents a buffer in shared memory.
///
/// SAFETY: The pointer must point to an area in shared memory. The functions allow you to liberally
/// get a mutable pointer to the contents; it is the caller's responsibility to ensure that you
/// don't access a buffer that you're not allowed to. Inappropriate access to the buffer doesn't
/// violate Rust's safety semantics, but it will mess up and crash Postgres.
///
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct ShmemBuf {
// Pointer to where the result is written or where to read from. Must point into a buffer in shared memory!
pub ptr: *mut u8,
}
unsafe impl Send for ShmemBuf {}
unsafe impl Sync for ShmemBuf {}
unsafe impl uring_common::buf::IoBuf for ShmemBuf {
fn stable_ptr(&self) -> *const u8 {
self.ptr
}
fn bytes_init(&self) -> usize {
crate::BLCKSZ
}
fn bytes_total(&self) -> usize {
crate::BLCKSZ
}
}
unsafe impl uring_common::buf::IoBufMut for ShmemBuf {
fn stable_mut_ptr(&mut self) -> *mut u8 {
self.ptr
}
unsafe fn set_init(&mut self, pos: usize) {
if pos > crate::BLCKSZ {
panic!(
"set_init called past end of buffer, pos {}, buffer size {}",
pos,
crate::BLCKSZ
);
}
}
}
impl ShmemBuf {
pub fn as_mut_ptr(&self) -> *mut u8 {
self.ptr
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelSizeRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub allow_missing: bool,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CGetPageVRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CGetPageVUncachedRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
pub request_lsn: CLsn,
pub not_modified_since: CLsn,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CReadSlruSegmentRequest {
pub request_id: u64,
pub slru_kind: SlruKind,
pub segment_number: u32,
pub request_lsn: CLsn,
/// Must be a null-terminated C string containing the file path
/// where the communicator will write the SLRU segment.
pub destination_file_path: ShmemBuf,
}
impl CReadSlruSegmentRequest {
/// Returns the file path where the communicator will write the
/// SLRU segment.
pub(crate) fn destination_file_path(&self) -> String {
unsafe { CStr::from_ptr(self.destination_file_path.as_mut_ptr() as *const _) }
.to_string_lossy()
.into_owned()
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CPrefetchVRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CDbSizeRequest {
pub request_id: u64,
pub db_oid: COid,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CWritePageRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// `src` defines the new page contents. Must point into a buffer in shared memory!
pub src: ShmemBuf,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelExtendRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// `src` defines the new page contents. Must point into a buffer in shared memory!
pub src: ShmemBuf,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelZeroExtendRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u32,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelCreateRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelTruncateRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub nblocks: u32,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelUnlinkRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub lsn: CLsn,
}
impl CRelSizeRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CGetPageVRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CGetPageVUncachedRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CPrefetchVRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CWritePageRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelExtendRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelZeroExtendRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelCreateRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelTruncateRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelUnlinkRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CUpdateCachedRelSizeRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub nblocks: u32,
pub lsn: CLsn,
}
impl CUpdateCachedRelSizeRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}

View File

@@ -4,13 +4,10 @@
//!
//! These are called from the communicator threads! Careful what you do, most Postgres
//! functions are not safe to call in that context.
use utils::lsn::Lsn;
#[cfg(not(test))]
unsafe extern "C" {
pub fn notify_proc_unsafe(procno: std::ffi::c_int);
pub fn callback_set_my_latch_unsafe();
pub fn callback_get_request_lsn_unsafe() -> crate::neon_request::CLsn;
pub fn callback_get_lfc_metrics_unsafe() -> LfcMetrics;
}
@@ -19,36 +16,20 @@ unsafe extern "C" {
// package, but the code coverage build still builds these and tries to link with the
// external C code.)
#[cfg(test)]
unsafe fn notify_proc_unsafe(_procno: std::ffi::c_int) {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_set_my_latch_unsafe() {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_get_request_lsn_unsafe() -> crate::neon_request::CLsn {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_get_lfc_metrics_unsafe() -> LfcMetrics {
panic!("not usable in unit tests");
}
// safe wrappers
pub(super) fn notify_proc(procno: std::ffi::c_int) {
unsafe { notify_proc_unsafe(procno) };
}
pub(super) fn callback_set_my_latch() {
unsafe { callback_set_my_latch_unsafe() };
}
pub(super) fn get_request_lsn() -> Lsn {
Lsn(unsafe { callback_get_request_lsn_unsafe() })
}
pub(super) fn callback_get_lfc_metrics() -> LfcMetrics {
unsafe { callback_get_lfc_metrics_unsafe() }
}

View File

@@ -19,105 +19,71 @@ use http::StatusCode;
use http::header::CONTENT_TYPE;
use measured::MetricGroup;
use measured::metric::MetricEncoding;
use measured::metric::gauge::GaugeState;
use measured::metric::group::Encoding;
use measured::text::BufferedTextEncoder;
use std::io::ErrorKind;
use std::sync::Arc;
use tokio::net::UnixListener;
use crate::NEON_COMMUNICATOR_SOCKET_NAME;
use crate::worker_process::lfc_metrics::LfcMetricsCollector;
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
enum ControlSocketState<'a> {
Full(&'a CommunicatorWorkerProcessStruct<'a>),
Legacy(LegacyControlSocketState),
}
impl CommunicatorWorkerProcessStruct {
/// Launch the listener
pub(crate) async fn launch_control_socket_listener(
&'static self,
) -> Result<(), std::io::Error> {
use axum::routing::get;
let app = Router::new()
.route("/metrics", get(get_metrics))
.route("/autoscaling_metrics", get(get_autoscaling_metrics))
.route("/debug/panic", get(handle_debug_panic))
.with_state(self);
struct LegacyControlSocketState {
pub(crate) lfc_metrics: LfcMetricsCollector,
}
// If the server is restarted, there might be an old socket still
// lying around. Remove it first.
match std::fs::remove_file(NEON_COMMUNICATOR_SOCKET_NAME) {
Ok(()) => {
tracing::warn!("removed stale control socket");
}
Err(e) if e.kind() == ErrorKind::NotFound => {}
Err(e) => {
tracing::error!("could not remove stale control socket: {e:#}");
// Try to proceed anyway. It will likely fail below though.
}
};
// Create the unix domain socket and start listening on it
let listener = UnixListener::bind(NEON_COMMUNICATOR_SOCKET_NAME)?;
tokio::spawn(async {
tracing::info!("control socket listener spawned");
axum::serve(listener, app)
.await
.expect("axum::serve never returns")
});
impl<T> MetricGroup<T> for LegacyControlSocketState
where
T: Encoding,
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
self.lfc_metrics.collect_group_into(enc)?;
Ok(())
}
}
/// Launch the listener
pub(crate) async fn launch_listener(
worker: Option<&'static CommunicatorWorkerProcessStruct<'static>>,
) -> Result<(), std::io::Error> {
use axum::routing::get;
let state = match worker {
Some(worker) => ControlSocketState::Full(worker),
None => ControlSocketState::Legacy(LegacyControlSocketState {
lfc_metrics: LfcMetricsCollector,
}),
};
let app = Router::new()
.route("/metrics", get(get_metrics))
.route("/autoscaling_metrics", get(get_autoscaling_metrics))
.route("/debug/panic", get(handle_debug_panic))
.route("/debug/dump_cache_map", get(dump_cache_map))
.with_state(Arc::new(state));
// If the server is restarted, there might be an old socket still
// lying around. Remove it first.
match std::fs::remove_file(NEON_COMMUNICATOR_SOCKET_NAME) {
Ok(()) => {
tracing::warn!("removed stale control socket");
}
Err(e) if e.kind() == ErrorKind::NotFound => {}
Err(e) => {
tracing::error!("could not remove stale control socket: {e:#}");
// Try to proceed anyway. It will likely fail below though.
}
};
// Create the unix domain socket and start listening on it
let listener = UnixListener::bind(NEON_COMMUNICATOR_SOCKET_NAME)?;
tokio::spawn(async {
tracing::info!("control socket listener spawned");
axum::serve(listener, app)
.await
.expect("axum::serve never returns")
});
Ok(())
}
/// Expose all Prometheus metrics.
async fn get_metrics(State(state): State<Arc<ControlSocketState<'_>>>) -> Response {
match state.as_ref() {
ControlSocketState::Full(worker) => metrics_to_response(&worker).await,
ControlSocketState::Legacy(legacy) => metrics_to_response(&legacy).await,
}
async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct>) -> Response {
tracing::trace!("/metrics requested");
metrics_to_response(&state).await
}
/// Expose Prometheus metrics, for use by the autoscaling agent.
///
/// This is a subset of all the metrics.
async fn get_autoscaling_metrics(State(state): State<Arc<ControlSocketState<'_>>>) -> Response {
match state.as_ref() {
ControlSocketState::Full(worker) => metrics_to_response(&worker.lfc_metrics).await,
ControlSocketState::Legacy(legacy) => metrics_to_response(&legacy.lfc_metrics).await,
}
async fn get_autoscaling_metrics(
State(state): State<&CommunicatorWorkerProcessStruct>,
) -> Response {
tracing::trace!("/metrics requested");
metrics_to_response(&state.lfc_metrics).await
}
async fn handle_debug_panic(State(_state): State<Arc<ControlSocketState<'_>>>) -> Response {
async fn handle_debug_panic(State(_state): State<&CommunicatorWorkerProcessStruct>) -> Response {
panic!("test HTTP handler task panic");
}
@@ -134,23 +100,3 @@ async fn metrics_to_response(metrics: &(dyn MetricGroup<BufferedTextEncoder> + S
.body(Body::from(enc.finish()))
.unwrap()
}
async fn dump_cache_map(State(state): State<Arc<ControlSocketState<'_>>>) -> Response {
match state.as_ref() {
ControlSocketState::Full(worker) => {
let mut buf: Vec<u8> = Vec::new();
worker.cache.dump_map(&mut buf);
Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, "application/text")
.body(Body::from(buf))
.unwrap()
}
ControlSocketState::Legacy(_) => Response::builder()
.status(StatusCode::NOT_FOUND)
.header(CONTENT_TYPE, "application/text")
.body(Body::from(Vec::new()))
.unwrap(),
}
}
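As a rough sketch of how a local client could talk to the control socket above (illustrative only: it assumes the caller knows the socket path, and that plain HTTP/1.1 is spoken over the socket, which is what axum::serve provides), the metrics endpoint can be scraped with a raw request over a Unix stream:

use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::UnixStream;

async fn scrape_metrics(socket_path: &str) -> std::io::Result<String> {
    // Connect to the communicator's control socket and issue a plain HTTP/1.1 request.
    let mut stream = UnixStream::connect(socket_path).await?;
    stream
        .write_all(b"GET /metrics HTTP/1.1\r\nHost: localhost\r\nConnection: close\r\n\r\n")
        .await?;
    // With "Connection: close" the server closes the stream after responding, so
    // reading to EOF returns the whole response (headers plus Prometheus text).
    let mut response = String::new();
    stream.read_to_string(&mut response).await?;
    Ok(response)
}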

View File

@@ -1,95 +0,0 @@
//! Lock table to ensure that only one IO request is in flight for a given
//! block (or relation or database metadata) at a time
use std::cmp::Eq;
use std::hash::Hash;
use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};
use clashmap::ClashMap;
use clashmap::Entry;
use pageserver_page_api::RelTag;
#[derive(Clone, Eq, Hash, PartialEq)]
pub enum RequestInProgressKey {
Db(u32),
Rel(RelTag),
Block(RelTag, u32),
}
type RequestId = u64;
pub type RequestInProgressTable = MutexHashMap<RequestInProgressKey, RequestId>;
// more primitive locking thingie:
pub struct MutexHashMap<K, V>
where
K: Clone + Eq + Hash,
{
lock_table: ClashMap<K, (V, Arc<Mutex<()>>)>,
}
pub struct MutexHashMapGuard<'a, K, V>
where
K: Clone + Eq + Hash,
{
pub key: K,
map: &'a MutexHashMap<K, V>,
mutex: Arc<Mutex<()>>,
_guard: OwnedMutexGuard<()>,
}
impl<'a, K, V> Drop for MutexHashMapGuard<'a, K, V>
where
K: Clone + Eq + Hash,
{
fn drop(&mut self) {
let (_old_key, old_val) = self.map.lock_table.remove(&self.key).unwrap();
assert!(Arc::ptr_eq(&old_val.1, &self.mutex));
// the guard will be dropped as we return
}
}
impl<K, V> MutexHashMap<K, V>
where
K: Clone + Eq + Hash,
V: std::fmt::Display + Copy,
{
pub fn new() -> MutexHashMap<K, V> {
MutexHashMap {
lock_table: ClashMap::new(),
}
}
pub async fn lock<'a>(&'a self, key: K, val: V) -> MutexHashMapGuard<'a, K, V> {
let my_mutex = Arc::new(Mutex::new(()));
let my_guard = Arc::clone(&my_mutex).lock_owned().await;
loop {
let (request_id, lock) = match self.lock_table.entry(key.clone()) {
Entry::Occupied(e) => {
let e = e.get();
(e.0, Arc::clone(&e.1))
}
Entry::Vacant(e) => {
e.insert((val, Arc::clone(&my_mutex)));
break;
}
};
tracing::info!("waiting for conflicting IO {request_id} to complete");
let _ = lock.lock().await;
tracing::info!("conflicting IO {request_id} completed");
}
MutexHashMapGuard {
key,
map: self,
mutex: my_mutex,
_guard: my_guard,
}
}
}
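For context, a minimal usage sketch of the per-key lock table defined above (illustrative only; the example_io function and the request id 42 are made up). A caller takes the lock for the block it is about to operate on, performs the IO while the guard is held, and dropping the guard removes the entry and wakes any waiter blocked on the same key:

async fn example_io(table: &RequestInProgressTable, rel: RelTag, blkno: u32) {
    // Only one IO for this (rel, block) may be in flight at a time; a second caller
    // with the same key waits here until the first guard is dropped.
    let _guard = table
        .lock(RequestInProgressKey::Block(rel, blkno), 42 /* request id */)
        .await;
    // ... perform the read or write for request 42 while holding the guard ...
    // `_guard` is dropped at the end of the scope, removing the entry from the table.
}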

View File

@@ -1,126 +1,34 @@
use std::collections::HashMap;
use std::os::fd::AsRawFd;
use std::os::fd::OwnedFd;
use std::path::PathBuf;
use std::str::FromStr as _;
use crate::backend_comms::NeonIORequestSlot;
use crate::file_cache::FileCache;
use crate::global_allocator::MyAllocatorCollector;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{CacheResult, IntegratedCacheWriteAccess};
use crate::neon_request::{CGetPageVRequest, CGetPageVUncachedRequest, CPrefetchVRequest};
use crate::neon_request::{INVALID_BLOCK_NUMBER, NeonIORequest, NeonIOResult};
use crate::worker_process::control_socket;
use crate::worker_process::in_progress_ios::{RequestInProgressKey, RequestInProgressTable};
use crate::worker_process::lfc_metrics::LfcMetricsCollector;
use pageserver_client_grpc::{PageserverClient, ShardSpec, ShardStripeSize};
use pageserver_page_api as page_api;
use tokio::io::AsyncReadExt;
use tokio_pipe::PipeRead;
use uring_common::buf::IoBuf;
use measured::MetricGroup;
use measured::metric::MetricEncoding;
use measured::metric::counter::CounterState;
use measured::metric::gauge::GaugeState;
use measured::metric::group::Encoding;
use measured::{Gauge, GaugeVec};
use utils::id::{TenantId, TimelineId};
use super::callbacks::{get_request_lsn, notify_proc};
use tracing::{error, info, info_span, trace};
use utils::lsn::Lsn;
pub struct CommunicatorWorkerProcessStruct<'a> {
/// Tokio runtime that the main loop and any other related tasks runs in.
pub(crate) runtime: tokio::runtime::Runtime,
/// Client to communicate with the pageserver
client: PageserverClient,
/// Request slots that backends use to send IO requests to the communicator.
neon_request_slots: &'a [NeonIORequestSlot],
/// Notification pipe. Backends use this to notify the communicator that a request is waiting to
/// be processed in one of the request slots.
submission_pipe_read_fd: OwnedFd,
/// Locking table for all in-progress IO requests.
in_progress_table: RequestInProgressTable,
/// Local File Cache, relation size tracking, last-written LSN tracking
pub(crate) cache: IntegratedCacheWriteAccess<'a>,
pub struct CommunicatorWorkerProcessStruct {
runtime: tokio::runtime::Runtime,
/*** Metrics ***/
pub(crate) lfc_metrics: LfcMetricsCollector,
request_counters: GaugeVec<RequestTypeLabelGroupSet>,
getpage_cache_misses_counter: Gauge,
getpage_cache_hits_counter: Gauge,
// For the requests that affect multiple blocks, have separate counters for the # of blocks affected
request_nblocks_counters: GaugeVec<RequestTypeLabelGroupSet>,
allocator_metrics: MyAllocatorCollector,
}
// Define a label group, consisting of 1 or more label values
#[derive(measured::LabelGroup)]
#[label(set = RequestTypeLabelGroupSet)]
struct RequestTypeLabelGroup {
request_type: crate::neon_request::NeonIORequestDiscriminants,
}
impl RequestTypeLabelGroup {
fn from_req(req: &NeonIORequest) -> Self {
RequestTypeLabelGroup {
request_type: req.into(),
}
}
}
/// Launch the communicator process's Rust subsystems
#[allow(clippy::too_many_arguments)]
pub(super) fn init_legacy() -> Result<(), String> {
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.thread_name("communicator thread")
.build()
.unwrap();
// Start the listener on the control socket
runtime
.block_on(control_socket::launch_listener(None))
.map_err(|e| e.to_string())?;
Box::leak(Box::new(runtime));
Ok(())
}
/// Launch the communicator process's Rust subsystems
#[allow(clippy::too_many_arguments)]
pub(super) fn init(
cis: CommunicatorInitStruct,
tenant_id: &str,
timeline_id: &str,
auth_token: Option<&str>,
shard_map: HashMap<utils::shard::ShardIndex, String>,
stripe_size: Option<ShardStripeSize>,
initial_file_cache_size: u64,
file_cache_path: Option<PathBuf>,
) -> Result<&'static CommunicatorWorkerProcessStruct<'static>, String> {
tenant_id: Option<&str>,
timeline_id: Option<&str>,
) -> Result<&'static CommunicatorWorkerProcessStruct, String> {
// The caller validated these already
let tenant_id = TenantId::from_str(tenant_id).map_err(|e| format!("invalid tenant ID: {e}"))?;
let timeline_id =
TimelineId::from_str(timeline_id).map_err(|e| format!("invalid timeline ID: {e}"))?;
let shard_spec =
ShardSpec::new(shard_map, stripe_size).map_err(|e| format!("invalid shard spec: {e}:"))?;
let _tenant_id = tenant_id
.map(TenantId::from_str)
.transpose()
.map_err(|e| format!("invalid tenant ID: {e}"))?;
let _timeline_id = timeline_id
.map(TimelineId::from_str)
.transpose()
.map_err(|e| format!("invalid timeline ID: {e}"))?;
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
@@ -128,716 +36,31 @@ pub(super) fn init(
.build()
.unwrap();
let last_lsn = get_request_lsn();
let file_cache = if let Some(path) = file_cache_path {
Some(FileCache::new(&path, initial_file_cache_size).expect("could not create cache file"))
} else {
// FIXME: temporarily for testing, use LFC even if disabled
Some(
FileCache::new(&PathBuf::from("new_filecache"), 1000)
.expect("could not create cache file"),
)
};
// Initialize subsystems
let cache = cis
.integrated_cache_init_struct
.worker_process_init(last_lsn, file_cache);
let client = {
let _guard = runtime.enter();
PageserverClient::new(
tenant_id,
timeline_id,
shard_spec,
auth_token.map(|s| s.to_string()),
None,
)
.expect("could not create client")
};
let worker_struct = CommunicatorWorkerProcessStruct {
// Note: it's important to not drop the runtime, or all the tasks are dropped
// too. Including it in the returned struct is one way to keep it around.
runtime,
neon_request_slots: cis.neon_request_slots,
client,
cache,
submission_pipe_read_fd: cis.submission_pipe_read_fd,
in_progress_table: RequestInProgressTable::new(),
// metrics
lfc_metrics: LfcMetricsCollector,
request_counters: GaugeVec::new(),
getpage_cache_misses_counter: Gauge::new(),
getpage_cache_hits_counter: Gauge::new(),
request_nblocks_counters: GaugeVec::new(),
allocator_metrics: MyAllocatorCollector::new(),
};
let worker_struct = Box::leak(Box::new(worker_struct));
let main_loop_handle = worker_struct.runtime.spawn(worker_struct.run());
worker_struct.runtime.spawn(async {
let err = main_loop_handle.await.unwrap_err();
error!("error: {err:?}");
});
// Start the listener on the control socket
worker_struct
.runtime
.block_on(control_socket::launch_listener(Some(worker_struct)))
.block_on(worker_struct.launch_control_socket_listener())
.map_err(|e| e.to_string())?;
Ok(worker_struct)
}
impl<'t> CommunicatorWorkerProcessStruct<'t> {
/// Update the configuration
pub(super) fn update_shard_map(
&self,
new_shard_map: HashMap<utils::shard::ShardIndex, String>,
stripe_size: Option<ShardStripeSize>,
) {
let shard_spec = ShardSpec::new(new_shard_map, stripe_size).expect("invalid shard spec");
{
let _in_runtime = self.runtime.enter();
if let Err(err) = self.client.update_shards(shard_spec) {
tracing::error!("could not update shard map: {err:?}");
}
}
}
/// Main loop of the worker process. Receive requests from the backends and process them.
pub(super) async fn run(&'static self) {
let mut idxbuf: [u8; 4] = [0; 4];
let mut submission_pipe_read =
PipeRead::try_from(self.submission_pipe_read_fd.as_raw_fd()).expect("invalid pipe fd");
loop {
// Wait for a backend to ring the doorbell
match submission_pipe_read.read(&mut idxbuf).await {
Ok(4) => {}
Ok(nbytes) => panic!("short read ({nbytes} bytes) on communicator pipe"),
Err(e) => panic!("error reading from communicator pipe: {e}"),
}
let slot_idx = u32::from_ne_bytes(idxbuf) as usize;
// Read the IO request from the slot indicated in the wakeup
let Some(slot) = self.neon_request_slots[slot_idx].start_processing_request() else {
// This currently should not happen, but if we had multiple threads picking up
// requests without waiting for the notifications, it could.
panic!("no request in slot");
};
// Ok, we have ownership of this request now. We must process it now, there's no going
// back.
//
// Spawn a separate task for every request. That's a little excessive for requests that
// can be quickly satisfied from the cache, but we expect that to be rare, because the
// requesting backend would have already checked the cache.
tokio::spawn(async move {
use tracing::Instrument;
let request_id = slot.get_request().request_id();
let owner_procno = slot.get_owner_procno();
let span = info_span!(
"processing",
request_id = request_id,
slot_idx = slot_idx,
procno = owner_procno,
);
async {
// FIXME: as a temporary hack, abort the request if we don't get a response
// promptly.
//
// Lots of regression tests are getting stuck and failing at the moment;
// this makes them fail a little faster, which makes it faster to iterate.
// This needs to be removed once more regression tests are passing.
// See also similar hack in the backend code, in wait_request_completion()
let result = tokio::time::timeout(
tokio::time::Duration::from_secs(60),
self.handle_request(slot.get_request()),
)
.await
.unwrap_or_else(|_elapsed| {
info!("request {request_id} timed out");
NeonIOResult::Error(libc::ETIMEDOUT)
});
trace!("request {request_id} at slot {slot_idx} completed");
// Ok, we have completed the IO. Mark the request as completed. After that,
// we no longer have ownership of the slot, and must not modify it.
slot.completed(result);
// Notify the backend about the completion. (Note that the backend might see
// the completed status even before this; this is just a wakeup)
notify_proc(owner_procno);
}
.instrument(span)
.await
});
}
}
/// Compute the 'request_lsn' to use for a pageserver request
fn request_lsns(&self, not_modified_since_lsn: Lsn) -> page_api::ReadLsn {
let mut request_lsn = get_request_lsn();
// Is it possible that the last-written LSN is ahead of last flush LSN? Generally not, we
// shouldn't evict a page from the buffer cache before all its modifications have been
// safely flushed. That's the "WAL before data" rule. However, there are a few exceptions:
//
// - when creating an index: _bt_blwritepage logs the full page without flushing WAL before
// smgrextend (files are fsynced before build ends).
//
// XXX: If we make a request LSN greater than the current WAL flush LSN, the pageserver would
// block waiting for the WAL to arrive, until we flush it and it propagates through the
// safekeepers to the pageserver. If there's nothing that forces the WAL to be flushed,
// the pageserver would get stuck waiting forever. To avoid that, all the write-
// functions in communicator_new.c call XLogSetAsyncXactLSN(). That nudges the WAL writer to
// perform the flush relatively soon.
//
// It would perhaps be nicer to do the WAL flush here, but it's tricky to call back into
// Postgres code to do that from here. That's why we rely on communicator_new.c to do the
// calls "pre-emptively".
//
// FIXME: Because of the above, it can still happen that not_modified_since is ahead of
// the flush LSN, if the WAL writer hasn't done the flush yet. It would be nice to know
// if there are other cases like that which we have missed, but unfortunately we cannot
// turn this into an assertion because of that legit case.
//
// See also the old logic in neon_get_request_lsns() C function
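//
// Worked example with illustrative LSNs: if get_request_lsn() returns 0/5000 but the
// page's last-written LSN is 0/6000 (the WAL writer hasn't flushed it yet), the clamp
// below bumps request_lsn to 0/6000, producing
// ReadLsn { request_lsn: 0/6000, not_modified_since_lsn: Some(0/6000) },
// so the not_modified_since hint never exceeds the request LSN.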
if not_modified_since_lsn > request_lsn {
tracing::info!(
"not_modified_since_lsn {} is ahead of last flushed LSN {}",
not_modified_since_lsn,
request_lsn
);
request_lsn = not_modified_since_lsn;
}
page_api::ReadLsn {
request_lsn,
not_modified_since_lsn: Some(not_modified_since_lsn),
}
}
/// Handle one IO request
async fn handle_request(&'static self, request: &'_ NeonIORequest) -> NeonIOResult {
self.request_counters
.inc(RequestTypeLabelGroup::from_req(request));
match request {
NeonIORequest::Empty => {
error!("unexpected Empty IO request");
NeonIOResult::Error(0)
}
NeonIORequest::RelSize(req) => {
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Rel(rel), req.request_id)
.await;
// Check the cache first
let not_modified_since = match self.cache.get_rel_size(&rel) {
CacheResult::Found(nblocks) => {
tracing::trace!("found relsize for {:?} in cache: {}", rel, nblocks);
return NeonIOResult::RelSize(nblocks);
}
// XXX: we don't cache negative entries, so if there's no entry in the cache, it could mean
// that the relation doesn't exist or that we don't have it cached.
CacheResult::NotFound(lsn) => lsn,
};
let read_lsn = self.request_lsns(not_modified_since);
match self
.client
.get_rel_size(page_api::GetRelSizeRequest {
read_lsn,
rel,
allow_missing: req.allow_missing,
})
.await
{
Ok(Some(nblocks)) => {
// update the cache
tracing::trace!(
"updated relsize for {:?} in cache: {}, lsn {}",
rel,
nblocks,
read_lsn
);
self.cache
.remember_rel_size(&rel, nblocks, not_modified_since);
NeonIOResult::RelSize(nblocks)
}
Ok(None) => {
// TODO: cache negative entry?
NeonIOResult::RelSize(INVALID_BLOCK_NUMBER)
}
Err(err) => {
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
info!("tonic error: {err:?}");
NeonIOResult::Error(libc::EIO)
}
}
}
NeonIORequest::GetPageV(req) => match self.handle_get_pagev_request(req).await {
Ok(()) => NeonIOResult::GetPageV,
Err(errno) => NeonIOResult::Error(errno),
},
NeonIORequest::GetPageVUncached(req) => {
match self.handle_get_pagev_uncached_request(req).await {
Ok(()) => NeonIOResult::GetPageV,
Err(errno) => NeonIOResult::Error(errno),
}
}
NeonIORequest::ReadSlruSegment(req) => {
let lsn = Lsn(req.request_lsn);
let file_path = req.destination_file_path();
match self
.client
.get_slru_segment(page_api::GetSlruSegmentRequest {
read_lsn: self.request_lsns(lsn),
kind: req.slru_kind,
segno: req.segment_number,
})
.await
{
Ok(slru_bytes) => {
if let Err(e) = tokio::fs::write(&file_path, &slru_bytes).await {
error!("could not write slru segment to file {file_path}: {e}");
return NeonIOResult::Error(e.raw_os_error().unwrap_or(libc::EIO));
}
let blocks_count = slru_bytes.len() / crate::BLCKSZ;
NeonIOResult::ReadSlruSegment(blocks_count as _)
}
Err(err) => {
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
info!("tonic error: {err:?}");
NeonIOResult::Error(libc::EIO)
}
}
}
NeonIORequest::PrefetchV(req) => {
self.request_nblocks_counters
.inc_by(RequestTypeLabelGroup::from_req(request), req.nblocks as i64);
let req = *req;
// FIXME: handle_request() runs in a separate task already, do we really need to spawn a new one here?
tokio::spawn(async move { self.handle_prefetchv_request(&req).await });
NeonIOResult::PrefetchVLaunched
}
NeonIORequest::DbSize(req) => {
let _in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Db(req.db_oid), req.request_id)
.await;
// Check the cache first
let not_modified_since = match self.cache.get_db_size(req.db_oid) {
CacheResult::Found(db_size) => {
// Found the database size in the cache
return NeonIOResult::DbSize(db_size);
}
CacheResult::NotFound(lsn) => lsn,
};
match self
.client
.get_db_size(page_api::GetDbSizeRequest {
read_lsn: self.request_lsns(not_modified_since),
db_oid: req.db_oid,
})
.await
{
Ok(db_size) => NeonIOResult::DbSize(db_size),
Err(err) => {
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
info!("tonic error: {err:?}");
NeonIOResult::Error(libc::EIO)
}
}
}
// Write requests
NeonIORequest::WritePage(req) => {
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(
RequestInProgressKey::Block(rel, req.block_number),
req.request_id,
)
.await;
// We must at least update the last-written LSN on the page, but also store the page
// image in the LFC while we still have it
self.cache
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
.await;
NeonIOResult::WriteOK
}
NeonIORequest::RelExtend(req) => {
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(
RequestInProgressKey::Block(rel, req.block_number),
req.request_id,
)
.await;
// We must at least update the last-written LSN on the page and the relation size,
// but also store the page image in the LFC while we still have it
self.cache
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
.await;
self.cache
.remember_rel_size(&req.reltag(), req.block_number + 1, Lsn(req.lsn));
NeonIOResult::WriteOK
}
NeonIORequest::RelZeroExtend(req) => {
self.request_nblocks_counters
.inc_by(RequestTypeLabelGroup::from_req(request), req.nblocks as i64);
// TODO: need to grab an io-in-progress lock for this? I guess not
// TODO: We could put the empty pages in the cache. Maybe have
// a marker on the block entries for all-zero pages, instead of
// actually storing the empty pages.
self.cache.remember_rel_size(
&req.reltag(),
req.block_number + req.nblocks,
Lsn(req.lsn),
);
NeonIOResult::WriteOK
}
NeonIORequest::RelCreate(req) => {
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache.remember_rel_size(&req.reltag(), 0, Lsn(req.lsn));
NeonIOResult::WriteOK
}
NeonIORequest::RelTruncate(req) => {
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache
.remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
NeonIOResult::WriteOK
}
NeonIORequest::RelUnlink(req) => {
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache.forget_rel(&req.reltag(), None, Lsn(req.lsn));
NeonIOResult::WriteOK
}
NeonIORequest::UpdateCachedRelSize(req) => {
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache
.remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
NeonIOResult::WriteOK
}
}
}
/// Subroutine to handle a GetPageV request, since it's a little more complicated than
/// others.
async fn handle_get_pagev_request(&'t self, req: &CGetPageVRequest) -> Result<(), i32> {
let rel = req.reltag();
// Check the cache first
//
// Note: Because the backends perform a direct lookup in the cache before sending
// the request to the communicator process, we expect the pages to almost never
// be already in cache. It could happen if:
// 1. two backends try to read the same page at the same time, but that should never
// happen because there's higher level locking in the Postgres buffer manager, or
// 2. a prefetch request finished at the same time as a backend requested the
// page. That's much more likely.
let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
for i in 0..req.nblocks {
let blkno = req.block_number + i as u32;
// note: this is deadlock-safe even though we hold multiple locks at the same time,
// because they're always acquired in the same order.
let in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Block(rel, blkno), req.request_id)
.await;
let dest = req.dest[i as usize];
let not_modified_since = match self.cache.get_page(&rel, blkno, dest).await {
Ok(CacheResult::Found(_)) => {
// get_page already copied the block content to the destination
trace!("found blk {} in rel {:?} in LFC", blkno, rel);
continue;
}
Ok(CacheResult::NotFound(lsn)) => lsn,
Err(_io_error) => return Err(libc::EIO), // FIXME print the error?
};
cache_misses.push((blkno, not_modified_since, dest, in_progress_guard));
}
self.getpage_cache_misses_counter
.inc_by(cache_misses.len() as i64);
self.getpage_cache_hits_counter
.inc_by(req.nblocks as i64 - cache_misses.len() as i64);
if cache_misses.is_empty() {
return Ok(());
}
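// Use the newest not_modified_since among the misses for the whole batch: a page that
// is unmodified since its own LSN is also unmodified since any newer LSN, so a single
// ReadLsn can cover every requested block.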
let not_modified_since = cache_misses
.iter()
.map(|(_blkno, lsn, _dest, _guard)| *lsn)
.max()
.unwrap();
// Construct a pageserver request for the cache misses
let block_numbers: Vec<u32> = cache_misses
.iter()
.map(|(blkno, _lsn, _dest, _guard)| *blkno)
.collect();
let read_lsn = self.request_lsns(not_modified_since);
trace!(
"sending getpage request for blocks {:?} in rel {:?} lsns {}",
block_numbers, rel, read_lsn
);
match self
.client
.get_page(page_api::GetPageRequest {
request_id: req.request_id.into(),
request_class: page_api::GetPageClass::Normal,
read_lsn,
rel,
block_numbers: block_numbers.clone(),
})
.await
{
Ok(resp) => {
// Write the received page images directly to the shared memory location
// that the backend requested.
if resp.pages.len() != block_numbers.len() {
error!(
"received unexpected response with {} page images from pageserver for a request for {} pages",
resp.pages.len(),
block_numbers.len(),
);
return Err(libc::EIO);
}
trace!(
"received getpage response for blocks {:?} in rel {:?} lsns {}",
block_numbers, rel, read_lsn
);
for (page, (blkno, _lsn, dest, _guard)) in resp.pages.into_iter().zip(cache_misses)
{
let src: &[u8] = page.image.as_ref();
let len = std::cmp::min(src.len(), dest.bytes_total());
unsafe {
std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
};
// Also store it in the LFC while we have it
self.cache
.remember_page(
&rel,
blkno,
page.image,
read_lsn.not_modified_since_lsn.unwrap(),
false,
)
.await;
}
}
Err(err) => {
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
info!("tonic error: {err:?}");
return Err(libc::EIO);
}
}
Ok(())
}
/// Subroutine to handle an GetPageVUncached request.
///
/// Note: this bypasses the cache, in-progress IO locking, and all other side-effects.
/// This request type is only used in tests.
async fn handle_get_pagev_uncached_request(
&'t self,
req: &CGetPageVUncachedRequest,
) -> Result<(), i32> {
let rel = req.reltag();
// Construct a pageserver request
let block_numbers: Vec<u32> =
(req.block_number..(req.block_number + (req.nblocks as u32))).collect();
let read_lsn = page_api::ReadLsn {
request_lsn: Lsn(req.request_lsn),
not_modified_since_lsn: Some(Lsn(req.not_modified_since)),
};
trace!(
"sending (uncached) getpage request for blocks {:?} in rel {:?} lsns {}",
block_numbers, rel, read_lsn
);
match self
.client
.get_page(page_api::GetPageRequest {
request_id: req.request_id.into(),
request_class: page_api::GetPageClass::Normal,
read_lsn,
rel,
block_numbers: block_numbers.clone(),
})
.await
{
Ok(resp) => {
// Write the received page images directly to the shared memory location
// that the backend requested.
if resp.pages.len() != block_numbers.len() {
error!(
"received unexpected response with {} page images from pageserver for a request for {} pages",
resp.pages.len(),
block_numbers.len(),
);
return Err(libc::EIO);
}
trace!(
"received getpage response for blocks {:?} in rel {:?} lsns {}",
block_numbers, rel, read_lsn
);
for (page, dest) in resp.pages.into_iter().zip(req.dest) {
let src: &[u8] = page.image.as_ref();
let len = std::cmp::min(src.len(), dest.bytes_total());
unsafe {
std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
};
}
}
Err(err) => {
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
info!("tonic error: {err:?}");
return Err(libc::EIO);
}
}
Ok(())
}
/// Subroutine to handle a PrefetchV request, since it's a little more complicated than
/// others.
///
/// This is very similar to a GetPageV request, but the results are only stored in the cache.
async fn handle_prefetchv_request(&'static self, req: &CPrefetchVRequest) -> Result<(), i32> {
let rel = req.reltag();
// Check the cache first
let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
for i in 0..req.nblocks {
let blkno = req.block_number + i as u32;
// note: this is deadlock-safe even though we hold multiple locks at the same time,
// because they're always acquired in the same order.
let in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Block(rel, blkno), req.request_id)
.await;
let not_modified_since = match self.cache.page_is_cached(&rel, blkno).await {
Ok(CacheResult::Found(_)) => {
trace!("found blk {} in rel {:?} in LFC", blkno, rel);
continue;
}
Ok(CacheResult::NotFound(lsn)) => lsn,
Err(_io_error) => return Err(libc::EIO), // FIXME print the error?
};
cache_misses.push((blkno, not_modified_since, in_progress_guard));
}
if cache_misses.is_empty() {
return Ok(());
}
let not_modified_since = cache_misses
.iter()
.map(|(_blkno, lsn, _guard)| *lsn)
.max()
.unwrap();
let block_numbers: Vec<u32> = cache_misses
.iter()
.map(|(blkno, _lsn, _guard)| *blkno)
.collect();
// TODO: spawn separate tasks for these. Use the integrated cache to keep track of the
// in-flight requests
match self
.client
.get_page(page_api::GetPageRequest {
request_id: req.request_id.into(),
request_class: page_api::GetPageClass::Prefetch,
read_lsn: self.request_lsns(not_modified_since),
rel,
block_numbers: block_numbers.clone(),
})
.await
{
Ok(resp) => {
trace!(
"prefetch completed, remembering blocks {:?} in rel {:?} in LFC",
block_numbers, rel
);
if resp.pages.len() != block_numbers.len() {
error!(
"received unexpected response with {} page images from pageserver for a request for {} pages",
resp.pages.len(),
block_numbers.len(),
);
return Err(libc::EIO);
}
for (page, (blkno, _lsn, _guard)) in resp.pages.into_iter().zip(cache_misses) {
self.cache
.remember_page(&rel, blkno, page.image, not_modified_since, false)
.await;
}
}
Err(err) => {
// FIXME: Could we map the tonic StatusCode to a libc errno in a more fine-grained way? Or pass the error message to the backend
info!("tonic error: {err:?}");
return Err(libc::EIO);
}
}
Ok(())
}
}
impl<T> MetricGroup<T> for CommunicatorWorkerProcessStruct<'_>
impl<T> MetricGroup<T> for CommunicatorWorkerProcessStruct
where
T: Encoding,
CounterState: MetricEncoding<T>,
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
use measured::metric::MetricFamilyEncoding;
use measured::metric::name::MetricName;
self.lfc_metrics.collect_group_into(enc)?;
self.cache.collect_group_into(enc)?;
self.request_counters
.collect_family_into(MetricName::from_str("request_counters"), enc)?;
self.request_nblocks_counters
.collect_family_into(MetricName::from_str("request_nblocks_counters"), enc)?;
self.allocator_metrics.collect_group_into(enc)?;
Ok(())
self.lfc_metrics.collect_group_into(enc)
}
}

View File

@@ -4,9 +4,9 @@
//! - launch the main loop,
//! - receive IO requests from backends and process them,
//! - write results back to backends.
mod callbacks;
mod control_socket;
mod in_progress_ios;
mod lfc_metrics;
mod logging;
mod main_loop;

View File

@@ -1,21 +1,14 @@
//! Functions called from the C code in the worker process
use std::collections::HashMap;
use std::ffi::{CStr, CString, c_char};
use std::path::PathBuf;
use crate::init::CommunicatorInitStruct;
use crate::worker_process::main_loop;
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
use pageserver_client_grpc::ShardStripeSize;
/// Launch the communicator's tokio tasks, which do most of the work.
///
/// The caller has initialized the process as a regular PostgreSQL background worker
/// process. The shared memory segment used to communicate with the backends has been
/// allocated and initialized earlier, at postmaster startup, in
/// rcommunicator_shmem_init().
/// process.
///
/// Inputs:
/// `tenant_id` and `timeline_id` can be NULL, if we've been launched in "non-Neon" mode,
@@ -30,63 +23,27 @@ use pageserver_client_grpc::ShardStripeSize;
/// This is called only once in the process, so the returned struct, and error message in
/// case of failure, are simply leaked.
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_process_launch(
cis: Box<CommunicatorInitStruct>,
pub extern "C" fn communicator_worker_launch(
tenant_id: *const c_char,
timeline_id: *const c_char,
auth_token: *const c_char,
shard_map: *mut *mut c_char,
nshards: u32,
stripe_size: u32,
file_cache_path: *const c_char,
initial_file_cache_size: u64,
error_p: *mut *const c_char,
) -> Option<&'static CommunicatorWorkerProcessStruct<'static>> {
tracing::warn!("starting threads in rust code");
) -> Option<&'static CommunicatorWorkerProcessStruct> {
// Convert the arguments into more convenient Rust types
let tenant_id = {
let cstr = unsafe { CStr::from_ptr(tenant_id) };
cstr.to_str().expect("assume UTF-8")
};
let timeline_id = {
let cstr = unsafe { CStr::from_ptr(timeline_id) };
cstr.to_str().expect("assume UTF-8")
};
let auth_token = if auth_token.is_null() {
let tenant_id = if tenant_id.is_null() {
None
} else {
let cstr = unsafe { CStr::from_ptr(auth_token) };
let cstr = unsafe { CStr::from_ptr(tenant_id) };
Some(cstr.to_str().expect("assume UTF-8"))
};
let file_cache_path = {
if file_cache_path.is_null() {
None
} else {
let c_str = unsafe { CStr::from_ptr(file_cache_path) };
Some(PathBuf::from(c_str.to_str().unwrap()))
}
};
let shard_map = shard_map_to_hash(nshards, shard_map);
// FIXME: distinguish between unsharded, and sharded with 1 shard
// Also, we might go from unsharded to sharded while the system
// is running.
let stripe_size = if stripe_size > 0 && nshards > 1 {
Some(ShardStripeSize(stripe_size))
} else {
let timeline_id = if timeline_id.is_null() {
None
} else {
let cstr = unsafe { CStr::from_ptr(timeline_id) };
Some(cstr.to_str().expect("assume UTF-8"))
};
// The `init` function does all the work.
let result = main_loop::init(
*cis,
tenant_id,
timeline_id,
auth_token,
shard_map,
stripe_size,
initial_file_cache_size,
file_cache_path,
);
let result = main_loop::init(tenant_id, timeline_id);
// On failure, return the error message to the C caller in *error_p.
match result {
@@ -101,68 +58,3 @@ pub extern "C" fn communicator_worker_process_launch(
}
}
}
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_process_launch_legacy(error_p: *mut *const c_char) -> bool {
// The `init` function does all the work.
let result = main_loop::init_legacy();
// On failure, return the error message to the C caller in *error_p.
match result {
Ok(()) => true,
Err(errmsg) => {
let errmsg = CString::new(errmsg).expect("no nuls within error message");
let errmsg = Box::leak(errmsg.into_boxed_c_str());
let p: *const c_char = errmsg.as_ptr();
unsafe { *error_p = p };
false
}
}
}
/// Convert the "shard map" from an array of C strings, indexed by shard no to a rust HashMap
fn shard_map_to_hash(
nshards: u32,
shard_map: *mut *mut c_char,
) -> HashMap<utils::shard::ShardIndex, String> {
use utils::shard::*;
assert!(nshards <= u8::MAX as u32);
let mut result: HashMap<ShardIndex, String> = HashMap::new();
let mut p = shard_map;
for i in 0..nshards {
let c_str = unsafe { CStr::from_ptr(*p) };
p = unsafe { p.add(1) };
let s = c_str.to_str().unwrap();
let k = if nshards > 1 {
ShardIndex::new(ShardNumber(i as u8), ShardCount(nshards as u8))
} else {
ShardIndex::unsharded()
};
result.insert(k, s.into());
}
result
}
/// Inform the rust code about a configuration change
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_config_reload(
proc_handle: &'static CommunicatorWorkerProcessStruct<'static>,
file_cache_size: u64,
shard_map: *mut *mut c_char,
nshards: u32,
stripe_size: u32,
) {
proc_handle.runtime.spawn_blocking(move || {
proc_handle.cache.resize_file_cache(file_cache_size as u32);
});
let shard_map = shard_map_to_hash(nshards, shard_map);
let stripe_size = (nshards > 1).then_some(ShardStripeSize(stripe_size));
proc_handle.update_shard_map(shard_map, stripe_size);
}

File diff suppressed because it is too large

View File

@@ -1,69 +0,0 @@
/*-------------------------------------------------------------------------
*
* communicator_new.h
* new implementation
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#ifndef COMMUNICATOR_NEW_H
#define COMMUNICATOR_NEW_H
#include "storage/buf_internals.h"
#include "lfc_prewarm.h"
#include "neon.h"
#include "neon_pgversioncompat.h"
#include "pagestore_client.h"
/* initialization at postmaster startup */
extern void CommunicatorNewShmemRequest(void);
extern void CommunicatorNewShmemInit(void);
/* initialization at backend startup */
extern void communicator_new_init(void);
/* Read requests */
extern bool communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum);
extern BlockNumber communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forknum);
extern int64 communicator_new_dbsize(Oid dbNode);
extern void communicator_new_readv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber base_blockno,
void **buffers, BlockNumber nblocks);
extern void communicator_new_read_at_lsn_uncached(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
void *buffer, XLogRecPtr request_lsn, XLogRecPtr not_modified_since);
extern void communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno,
BlockNumber nblocks);
extern bool communicator_new_update_lwlsn_for_block_if_not_cached(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno, XLogRecPtr lsn);
extern int communicator_new_read_slru_segment(
SlruKind kind,
uint32_t segno,
neon_request_lsns * request_lsns,
const char *path
);
/* Write requests, to keep the caches up-to-date */
extern void communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn);
extern void communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn);
extern void communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno, BlockNumber nblocks,
XLogRecPtr lsn);
extern void communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn);
extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn);
extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn);
extern void communicator_new_update_cached_rel_size(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn);
/* other functions */
extern int32 communicator_new_approximate_working_set_size_seconds(time_t duration, bool reset);
extern struct LfcMetrics communicator_new_get_lfc_metrics_unsafe(void);
extern FileCacheState *communicator_new_get_lfc_state(size_t max_entries);
extern struct LfcStatsEntry *communicator_new_lfc_get_stats(size_t *num_entries);
#endif /* COMMUNICATOR_NEW_H */

View File

@@ -18,9 +18,6 @@
#include <unistd.h>
#include "miscadmin.h"
#if PG_VERSION_NUM >= 150000
#include "access/xlogrecovery.h"
#endif
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include "postmaster/postmaster.h"
@@ -32,18 +29,14 @@
#include "tcop/tcopprot.h"
#include "utils/timestamp.h"
#include "communicator_new.h"
#include "communicator_process.h"
#include "file_cache.h"
#include "neon.h"
#include "neon_perf_counters.h"
#include "pagestore_client.h"
/* the rust bindings, generated by cbindgen */
#include "communicator/communicator_bindings.h"
struct CommunicatorInitStruct *cis;
static void pump_logging(struct LoggingReceiver *logging);
PGDLLEXPORT void communicator_new_bgworker_main(Datum main_arg);
@@ -77,13 +70,9 @@ pg_init_communicator_process(void)
void
communicator_new_bgworker_main(Datum main_arg)
{
char **connstrings;
ShardMap shard_map;
uint64 file_cache_size;
struct LoggingReceiver *logging;
const char *errmsg = NULL;
const struct CommunicatorWorkerProcessStruct *proc_handle;
bool success;
/*
* Pretend that this process is a WAL sender. That affects the shutdown
@@ -119,43 +108,12 @@ communicator_new_bgworker_main(Datum main_arg)
logging = communicator_worker_configure_logging();
if (cis != NULL)
{
/* lfc_size_limit is in MBs */
file_cache_size = lfc_size_limit * (1024 * 1024 / BLCKSZ);
if (file_cache_size < 100)
file_cache_size = 100;
if (!parse_shard_map(pageserver_grpc_urls, &shard_map))
{
/* shouldn't happen, as the GUC was verified already */
elog(FATAL, "could not parse neon.pageserver_grpcs_urls");
}
connstrings = palloc(shard_map.num_shards * sizeof(char *));
for (int i = 0; i < shard_map.num_shards; i++)
connstrings[i] = shard_map.connstring[i];
AssignNumShards(shard_map.num_shards);
proc_handle = communicator_worker_process_launch(
cis,
neon_tenant,
neon_timeline,
neon_auth_token,
connstrings,
shard_map.num_shards,
neon_stripe_size,
lfc_path,
file_cache_size,
&errmsg);
pfree(connstrings);
cis = NULL;
success = proc_handle != NULL;
}
else
{
proc_handle = NULL;
success = communicator_worker_process_launch_legacy(&errmsg);
}
if (!success)
proc_handle = communicator_worker_launch(
neon_tenant[0] == '\0' ? NULL : neon_tenant,
neon_timeline[0] == '\0' ? NULL : neon_timeline,
&errmsg
);
if (proc_handle == NULL)
{
/*
* Something went wrong. Before exiting, forward any log messages that
@@ -215,32 +173,6 @@ communicator_new_bgworker_main(Datum main_arg)
{
ConfigReloadPending = false;
ProcessConfigFile(PGC_SIGHUP);
if (proc_handle)
{
/* lfc_size_limit is in MBs */
file_cache_size = lfc_size_limit * (1024 * 1024 / BLCKSZ);
if (file_cache_size < 100)
file_cache_size = 100;
/* Reload pageserver URLs */
if (!parse_shard_map(pageserver_grpc_urls, &shard_map))
{
/* shouldn't happen, as the GUC was verified already */
elog(FATAL, "could not parse neon.pageserver_grpcs_urls");
}
connstrings = palloc(shard_map.num_shards * sizeof(char *));
for (int i = 0; i < shard_map.num_shards; i++)
connstrings[i] = shard_map.connstring[i];
AssignNumShards(shard_map.num_shards);
communicator_worker_config_reload(proc_handle,
file_cache_size,
connstrings,
shard_map.num_shards,
neon_stripe_size);
pfree(connstrings);
}
}
duration = TimestampDifferenceMilliseconds(before, GetCurrentTimestamp());
@@ -339,49 +271,3 @@ callback_set_my_latch_unsafe(void)
{
SetLatch(MyLatch);
}
/*
* FIXME: The logic from neon_get_request_lsns() needs to go here, except for
* the last-written LSN cache stuff, which is managed by the rust code now.
*/
XLogRecPtr
callback_get_request_lsn_unsafe(void)
{
/*
* NB: be very careful with what you do here! This is called from tokio
* threads, so anything that tries to take LWLocks is unsafe, for example.
*
* RecoveryInProgress() is OK
*/
if (RecoveryInProgress())
{
XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
return replay_lsn;
}
else
{
XLogRecPtr flushlsn;
#if PG_VERSION_NUM >= 150000
flushlsn = GetFlushRecPtr(NULL);
#else
flushlsn = GetFlushRecPtr();
#endif
return flushlsn;
}
}
/*
* Get metrics, for the built-in metrics exporter that's part of the
* communicator process.
*/
struct LfcMetrics
callback_get_lfc_metrics_unsafe(void)
{
if (neon_use_communicator_worker)
return communicator_new_get_lfc_metrics_unsafe();
else
return lfc_get_metrics_unsafe();
}

View File

@@ -12,9 +12,6 @@
#ifndef COMMUNICATOR_PROCESS_H
#define COMMUNICATOR_PROCESS_H
extern struct CommunicatorInitStruct *cis;
/* initialization early at postmaster startup */
extern void pg_init_communicator_process(void);
#endif /* COMMUNICATOR_PROCESS_H */

View File

@@ -137,6 +137,15 @@ typedef struct FileCacheEntry
#define N_COND_VARS 64
#define CV_WAIT_TIMEOUT 10
#define MAX_PREWARM_WORKERS 8
typedef struct PrewarmWorkerState
{
uint32 prewarmed_pages;
uint32 skipped_pages;
TimestampTz completed;
} PrewarmWorkerState;
typedef struct FileCacheControl
{
uint64 generation; /* generation is needed to handle correct hash
@@ -182,27 +191,47 @@ typedef struct FileCacheControl
* again.
*/
HyperLogLogState wss_estimation;
/* Prewarmer state */
PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
size_t n_prewarm_workers;
size_t n_prewarm_entries;
size_t total_prewarm_pages;
size_t prewarm_batch;
bool prewarm_active;
bool prewarm_canceled;
dsm_handle prewarm_lfc_state_handle;
} FileCacheControl;
#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc
#define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks])
#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8)
#define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8)
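/* Worked example (illustrative sizes): a state with 4 chunks of 128 blocks each needs
 * sizeof(FileCacheState) + 4 * sizeof(BufferTag) + (4 * 128 + 7) / 8 bytes,
 * i.e. the header, 4 buffer tags and a 64-byte bitmap. */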
static HTAB *lfc_hash;
static int lfc_desc = -1;
static LWLockId lfc_lock;
int lfc_max_size;
int lfc_size_limit;
static int lfc_max_size;
static int lfc_size_limit;
static int lfc_prewarm_limit;
static int lfc_prewarm_batch;
static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
char *lfc_path;
static char *lfc_path;
static uint64 lfc_generation;
static FileCacheControl *lfc_ctl;
static bool lfc_do_prewarm;
bool lfc_store_prefetch_result;
bool lfc_prewarm_update_ws_estimation;
bool lfc_do_prewarm;
bool lfc_prewarm_cancel;
bool AmPrewarmWorker;
#define LFC_ENABLED() (lfc_ctl->limit != 0)
PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
/*
* Close LFC file if opened.
* All backends should close their LFC files once LFC is disabled.
@@ -228,8 +257,6 @@ lfc_switch_off(void)
{
int fd;
Assert(!neon_use_communicator_worker);
if (LFC_ENABLED())
{
HASH_SEQ_STATUS status;
@@ -295,8 +322,6 @@ lfc_maybe_disabled(void)
static bool
lfc_ensure_opened(void)
{
Assert(!neon_use_communicator_worker);
if (lfc_generation != lfc_ctl->generation)
{
lfc_close_file();
@@ -322,9 +347,6 @@ LfcShmemInit(void)
bool found;
static HASHCTL info;
if (neon_use_communicator_worker)
return;
if (lfc_max_size <= 0)
return;
@@ -514,6 +536,7 @@ lfc_init(void)
if (!process_shared_preload_libraries_in_progress)
neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries");
DefineCustomBoolVariable("neon.store_prefetch_result_in_lfc",
"Immediately store received prefetch result in LFC",
NULL,
@@ -585,6 +608,32 @@ lfc_init(void)
lfc_check_chunk_size,
lfc_change_chunk_size,
NULL);
DefineCustomIntVariable("neon.file_cache_prewarm_limit",
"Maximal number of prewarmed chunks",
NULL,
&lfc_prewarm_limit,
INT_MAX, /* no limit by default */
0,
INT_MAX,
PGC_SIGHUP,
0,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.file_cache_prewarm_batch",
"Number of pages retrivied by prewarm from page server",
NULL,
&lfc_prewarm_batch,
64,
1,
INT_MAX,
PGC_SIGHUP,
0,
NULL,
NULL,
NULL);
}
/*
@@ -609,7 +658,7 @@ lfc_get_state(size_t max_entries)
uint8* bitmap;
size_t n_pages = 0;
size_t n_entries = Min(max_entries, lfc_ctl->used - lfc_ctl->pinned);
size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries, lfc_blocks_per_chunk);
size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries);
fcs = (FileCacheState*)palloc0(state_size);
SET_VARSIZE(fcs, state_size);
fcs->magic = FILE_CACHE_STATE_MAGIC;
@@ -654,6 +703,278 @@ lfc_get_state(size_t max_entries)
return fcs;
}
/*
* Prewarm the LFC cache to the specified state. It uses the lfc_prefetch function to load prewarmed pages without holding the shared buffer lock
* and to avoid race conditions with other backends.
*/
void
lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
{
size_t fcs_chunk_size_log;
size_t n_entries;
size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size);
size_t fcs_size;
uint32_t max_prefetch_pages;
dsm_segment *seg;
BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];
if (!lfc_ensure_opened())
return;
if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0)
{
elog(LOG, "LFC: prewarm is disabled");
return;
}
if (n_workers > MAX_PREWARM_WORKERS)
{
elog(ERROR, "LFC: Too much prewarm workers, maximum is %d", MAX_PREWARM_WORKERS);
}
if (fcs == NULL || fcs->n_chunks == 0)
{
elog(LOG, "LFC: nothing to prewarm");
return;
}
if (fcs->magic != FILE_CACHE_STATE_MAGIC)
{
elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic);
}
fcs_size = VARSIZE(fcs);
if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size)
{
elog(ERROR, "LFC: Invalid file cache state size: %u vs. %u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs));
}
fcs_chunk_size_log = fcs->chunk_size_log;
if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG)
{
elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log);
}
n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
Assert(n_entries != 0);
max_prefetch_pages = n_entries << fcs_chunk_size_log;
if (fcs->n_pages > max_prefetch_pages) {
elog(ERROR, "LFC: Number of pages in file cache state (%d) is more than the limit (%d)", fcs->n_pages, max_prefetch_pages);
}
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
/* Do not prewarm more entries than LFC limit */
if (lfc_ctl->limit <= lfc_ctl->size)
{
elog(LOG, "LFC: skip prewarm because LFC is already filled");
LWLockRelease(lfc_lock);
return;
}
if (lfc_ctl->prewarm_active)
{
LWLockRelease(lfc_lock);
elog(ERROR, "LFC: skip prewarm because another prewarm is still active");
}
lfc_ctl->n_prewarm_entries = n_entries;
lfc_ctl->n_prewarm_workers = n_workers;
lfc_ctl->prewarm_active = true;
lfc_ctl->prewarm_canceled = false;
lfc_ctl->prewarm_batch = prewarm_batch;
memset(lfc_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState));
LWLockRelease(lfc_lock);
/* Calculate total number of pages to be prewarmed */
lfc_ctl->total_prewarm_pages = fcs->n_pages;
seg = dsm_create(fcs_size, 0);
memcpy(dsm_segment_address(seg), fcs, fcs_size);
lfc_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg);
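/* Workers attach to this segment via lfc_ctl->prewarm_lfc_state_handle in lfc_prewarm_main() */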
/* Spawn background workers */
for (uint32 i = 0; i < n_workers; i++)
{
BackgroundWorker worker = {0};
worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
worker.bgw_start_time = BgWorkerStart_ConsistentState;
worker.bgw_restart_time = BGW_NEVER_RESTART;
strcpy(worker.bgw_library_name, "neon");
strcpy(worker.bgw_function_name, "lfc_prewarm_main");
snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1);
strcpy(worker.bgw_type, "LFC prewarm worker");
worker.bgw_main_arg = Int32GetDatum(i);
/* must set notify PID to wait for shutdown */
worker.bgw_notify_pid = MyProcPid;
if (!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i]))
{
ereport(LOG,
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
errmsg("LFC: registering dynamic bgworker prewarm failed"),
errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes")));
n_workers = i;
lfc_ctl->prewarm_canceled = true;
break;
}
}
for (uint32 i = 0; i < n_workers; i++)
{
bool interrupted;
do
{
interrupted = false;
PG_TRY();
{
BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]);
if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED)
{
elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status);
}
}
PG_CATCH();
{
elog(LOG, "LFC: cancel prewarm");
lfc_ctl->prewarm_canceled = true;
interrupted = true;
}
PG_END_TRY();
} while (interrupted);
if (!lfc_ctl->prewarm_workers[i].completed)
{
/* The background worker didn't set a completion time, which means it terminated abnormally */
elog(LOG, "LFC: prewarm worker %d failed", i+1);
/* Set completion time to prevent get_prewarm_info from considering this worker as active */
lfc_ctl->prewarm_workers[i].completed = GetCurrentTimestamp();
}
}
dsm_detach(seg);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
lfc_ctl->prewarm_active = false;
LWLockRelease(lfc_lock);
}
void
lfc_prewarm_main(Datum main_arg)
{
size_t snd_idx = 0, rcv_idx = 0;
size_t n_sent = 0, n_received = 0;
size_t fcs_chunk_size_log;
size_t max_prefetch_pages;
size_t prewarm_batch;
size_t n_workers;
dsm_segment *seg;
FileCacheState* fcs;
uint8* bitmap;
BufferTag tag;
PrewarmWorkerState* ws;
uint32 worker_id = DatumGetInt32(main_arg);
AmPrewarmWorker = true;
pqsignal(SIGTERM, die);
BackgroundWorkerUnblockSignals();
seg = dsm_attach(lfc_ctl->prewarm_lfc_state_handle);
if (seg == NULL)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("could not map dynamic shared memory segment")));
fcs = (FileCacheState*) dsm_segment_address(seg);
prewarm_batch = lfc_ctl->prewarm_batch;
fcs_chunk_size_log = fcs->chunk_size_log;
n_workers = lfc_ctl->n_prewarm_workers;
max_prefetch_pages = lfc_ctl->n_prewarm_entries << fcs_chunk_size_log;
ws = &lfc_ctl->prewarm_workers[worker_id];
bitmap = FILE_CACHE_STATE_BITMAP(fcs);
/* enable prefetch in LFC */
lfc_store_prefetch_result = true;
lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existing entries if the LFC cache is full */
elog(LOG, "LFC: worker %d start prewarming", worker_id);
while (!lfc_ctl->prewarm_canceled)
{
if (snd_idx < max_prefetch_pages)
{
if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id)
{
/* If there are multiple workers, split chunks between them */
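/* e.g. with fcs_chunk_size_log = 7 (128 pages per chunk, an illustrative value) and
 * 2 workers, worker 0 owns page indexes 0..127, 256..383, ... and worker 1 owns
 * 128..255, 384..511, ... */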
snd_idx += 1 << fcs_chunk_size_log;
}
else
{
if (BITMAP_ISSET(bitmap, snd_idx))
{
tag = fcs->chunks[snd_idx >> fcs_chunk_size_log];
tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1);
if (!BufferTagIsValid(&tag)) {
elog(ERROR, "LFC: Invalid buffer tag: %u", tag.blockNum);
}
if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum))
{
(void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
n_sent += 1;
}
else
{
ws->skipped_pages += 1;
BITMAP_CLR(bitmap, snd_idx);
}
}
snd_idx += 1;
}
}
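/* Drain prefetch responses once we are a full batch ahead of what has been received,
 * or once all requests for this worker's chunks have been sent. */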
if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages)
{
if (n_received == n_sent && snd_idx == max_prefetch_pages)
{
break;
}
if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id)
{
/* Skip chunks processed by other workers */
rcv_idx += 1 << fcs_chunk_size_log;
continue;
}
/* Locate the next block whose prefetch result to receive */
while (!BITMAP_ISSET(bitmap, rcv_idx))
{
rcv_idx += 1;
}
tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log];
tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1);
if (communicator_prefetch_receive(tag))
{
ws->prewarmed_pages += 1;
}
else
{
ws->skipped_pages += 1;
}
rcv_idx += 1;
n_received += 1;
}
}
/* No need to perform prefetch cleanup here because the prewarm worker will be terminated and
* the connection to the PS dropped just after returning from this function.
*/
Assert(n_sent == n_received || lfc_ctl->prewarm_canceled);
elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received);
lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
}
void
lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
{
@@ -661,8 +982,6 @@ lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
FileCacheEntry *entry;
uint32 hash;
Assert(!neon_use_communicator_worker);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return;
@@ -708,8 +1027,6 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
bool found = false;
uint32 hash;
Assert(!neon_use_communicator_worker);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return false;
@@ -745,8 +1062,6 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
uint32 hash;
int i = 0;
Assert(!neon_use_communicator_worker);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return 0;
@@ -854,8 +1169,6 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
int blocks_read = 0;
int buf_offset = 0;
Assert(!neon_use_communicator_worker);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return -1;
@@ -1166,7 +1479,7 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
/* Can't add this chunk - we don't have the space for it */
hash_search_with_hash_value(lfc_hash, &entry->key, hash,
HASH_REMOVE, NULL);
lfc_prewarm_cancel = true; /* cancel prewarm if LFC limit is reached */
lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */
return false;
}
@@ -1221,8 +1534,6 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
Assert(!neon_use_communicator_worker);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return false;
@@ -1368,8 +1679,6 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
uint32 entry_offset;
int buf_offset = 0;
Assert(!neon_use_communicator_worker);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return;
@@ -1588,6 +1897,7 @@ lfc_get_stats(size_t *num_entries)
return entries;
}
/*
* Function returning data from the local file cache
* relation node/tablespace/database/blocknum and access_counter
@@ -1691,15 +2001,15 @@ lfc_approximate_working_set_size_seconds(time_t duration, bool reset)
}
/*
* Get metrics, for the built-in metrics exporter that's part of the
* communicator process.
* Get metrics, for the built-in metrics exporter that's part of the communicator
* process.
*
* NB: This is called from a Rust tokio task inside the communicator process.
* Acquiring lwlocks, elog(), allocating memory or anything else non-trivial
* is strictly prohibited here!
*/
struct LfcMetrics
lfc_get_metrics_unsafe(void)
callback_get_lfc_metrics_unsafe(void)
{
struct LfcMetrics result = {
.lfc_cache_size_limit = (int64) lfc_size_limit * 1024 * 1024,
@@ -1720,3 +2030,82 @@ lfc_get_metrics_unsafe(void)
return result;
}
PG_FUNCTION_INFO_V1(get_local_cache_state);
Datum
get_local_cache_state(PG_FUNCTION_ARGS)
{
size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
FileCacheState* fcs = lfc_get_state(max_entries);
if (fcs != NULL)
PG_RETURN_BYTEA_P((bytea*)fcs);
else
PG_RETURN_NULL();
}
PG_FUNCTION_INFO_V1(prewarm_local_cache);
Datum
prewarm_local_cache(PG_FUNCTION_ARGS)
{
bytea* state = PG_GETARG_BYTEA_PP(0);
uint32 n_workers = PG_GETARG_INT32(1);
FileCacheState* fcs = (FileCacheState*)state;
lfc_prewarm(fcs, n_workers);
PG_RETURN_NULL();
}
PG_FUNCTION_INFO_V1(get_prewarm_info);
Datum
get_prewarm_info(PG_FUNCTION_ARGS)
{
Datum values[4];
bool nulls[4];
TupleDesc tupdesc;
uint32 prewarmed_pages = 0;
uint32 skipped_pages = 0;
uint32 active_workers = 0;
uint32 total_pages;
size_t n_workers;
if (lfc_size_limit == 0)
PG_RETURN_NULL();
LWLockAcquire(lfc_lock, LW_SHARED);
if (!lfc_ctl || lfc_ctl->n_prewarm_workers == 0)
{
LWLockRelease(lfc_lock);
PG_RETURN_NULL();
}
n_workers = lfc_ctl->n_prewarm_workers;
total_pages = lfc_ctl->total_prewarm_pages;
for (size_t i = 0; i < n_workers; i++)
{
PrewarmWorkerState* ws = &lfc_ctl->prewarm_workers[i];
prewarmed_pages += ws->prewarmed_pages;
skipped_pages += ws->skipped_pages;
active_workers += ws->completed != 0;
}
LWLockRelease(lfc_lock);
tupdesc = CreateTemplateTupleDesc(4);
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0);
tupdesc = BlessTupleDesc(tupdesc);
MemSet(nulls, 0, sizeof(nulls));
values[0] = Int32GetDatum(total_pages);
values[1] = Int32GetDatum(prewarmed_pages);
values[2] = Int32GetDatum(skipped_pages);
values[3] = Int32GetDatum(active_workers);
PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
}

View File

@@ -11,19 +11,21 @@
#ifndef FILE_CACHE_h
#define FILE_CACHE_h
#include "lfc_prewarm.h"
#include "neon.h"
#include "neon_pgversioncompat.h"
typedef struct FileCacheState
{
int32 vl_len_; /* varlena header (do not touch directly!) */
uint32 magic;
uint32 n_chunks;
uint32 n_pages;
uint16 chunk_size_log;
BufferTag chunks[FLEXIBLE_ARRAY_MEMBER];
/* followed by bitmap */
} FileCacheState;
/* GUCs */
extern bool lfc_store_prefetch_result;
extern int lfc_max_size;
extern int lfc_size_limit;
extern char *lfc_path;
extern bool lfc_do_prewarm;
extern bool lfc_prewarm_cancel;
/* functions for local file cache */
extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
@@ -42,12 +44,16 @@ extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
extern void lfc_init(void);
extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
const void* buffer, XLogRecPtr lsn);
extern FileCacheState* lfc_get_state(size_t max_entries);
extern LfcStatsEntry *lfc_get_stats(size_t *num_entries);
extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);
struct LfcMetrics; /* defined in communicator_bindings.h */
extern struct LfcMetrics lfc_get_metrics_unsafe(void);
typedef struct LfcStatsEntry
{
const char *metric_name;
bool isnull;
uint64 value;
} LfcStatsEntry;
extern LfcStatsEntry *lfc_get_stats(size_t *num_entries);
typedef struct
{
@@ -63,6 +69,7 @@ extern LocalCachePagesRec *lfc_local_cache_pages(size_t *num_entries);
extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset);
static inline bool
lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
void *buffer)

View File

@@ -1,671 +0,0 @@
/*-------------------------------------------------------------------------
*
* lfc_prewarm.c
* Functions related to LFC prewarming
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "bitmap.h"
#include "communicator.h"
#include "communicator_new.h"
#include "file_cache.h"
#include "lfc_prewarm.h"
#include "neon.h"
#include "neon_utils.h"
#include "pagestore_client.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "postmaster/bgworker.h"
#include "storage/dsm.h"
#include "tcop/tcopprot.h"
#include "utils/timestamp.h"
#define MAX_PREWARM_WORKERS 8
typedef struct PrewarmWorkerState
{
uint32 prewarmed_pages;
uint32 skipped_pages;
TimestampTz completed;
} PrewarmWorkerState;
typedef struct PrewarmControl
{
/* -1 when not using workers, 0 when no prewarm has been performed */
size_t n_prewarm_workers;
size_t total_prewarm_pages;
bool prewarm_active;
bool prewarm_canceled;
/* These are used in the non-worker mode */
uint32 prewarmed_pages;
uint32 skipped_pages;
TimestampTz completed;
/* These are used with workers */
PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
dsm_handle prewarm_lfc_state_handle;
size_t prewarm_batch;
size_t n_prewarm_entries;
} PrewarmControl;
static PrewarmControl *prewarm_ctl;
static int lfc_prewarm_limit;
static int lfc_prewarm_batch;
static LWLockId prewarm_lock;
bool AmPrewarmWorker;
static void lfc_prewarm_with_workers(FileCacheState *fcs, uint32 n_workers);
static void lfc_prewarm_with_async_requests(FileCacheState *fcs);
PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
void
pg_init_prewarm(void)
{
DefineCustomIntVariable("neon.file_cache_prewarm_limit",
"Maximal number of prewarmed chunks",
NULL,
&lfc_prewarm_limit,
INT_MAX, /* no limit by default */
0,
INT_MAX,
PGC_SIGHUP,
0,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.file_cache_prewarm_batch",
"Number of pages retrivied by prewarm from page server",
NULL,
&lfc_prewarm_batch,
64,
1,
INT_MAX,
PGC_SIGHUP,
0,
NULL,
NULL,
NULL);
}
static size_t
PrewarmShmemSize(void)
{
return sizeof(PrewarmControl);
}
void
PrewarmShmemRequest(void)
{
RequestAddinShmemSpace(PrewarmShmemSize());
RequestNamedLWLockTranche("prewarm_lock", 1);
}
void
PrewarmShmemInit(void)
{
bool found;
prewarm_ctl = (PrewarmControl *) ShmemInitStruct("Prewarmer shmem state",
PrewarmShmemSize(),
&found);
if (!found)
{
/* it's zeroed already */
prewarm_lock = (LWLockId) GetNamedLWLockTranche("prewarm_lock");
}
}
static void
validate_fcs(FileCacheState *fcs)
{
size_t fcs_size;
#if 0
size_t fcs_chunk_size_log;
#endif
if (fcs->magic != FILE_CACHE_STATE_MAGIC)
{
elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic);
}
fcs_size = VARSIZE(fcs);
if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size)
{
elog(ERROR, "LFC: Invalid file cache state size: %u vs. %u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs));
}
/* FIXME */
#if 0
fcs_chunk_size_log = fcs->chunk_size_log;
if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG)
{
elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log);
}
#endif
}
/*
 * Prewarm the LFC to the specified state. It uses the lfc_prefetch function to
 * load prewarmed pages without holding shared buffer locks, avoiding race
 * conditions with other backends.
*/
void
lfc_prewarm_with_workers(FileCacheState *fcs, uint32 n_workers)
{
size_t n_entries;
size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size);
size_t fcs_size = VARSIZE(fcs);
dsm_segment *seg;
BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];
Assert(!neon_use_communicator_worker);
if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0)
{
elog(LOG, "LFC: prewarm is disabled");
return;
}
if (n_workers > MAX_PREWARM_WORKERS)
{
elog(ERROR, "LFC: too many prewarm workers, maximum is %d", MAX_PREWARM_WORKERS);
}
if (fcs == NULL || fcs->n_chunks == 0)
{
elog(LOG, "LFC: nothing to prewarm");
return;
}
n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
Assert(n_entries != 0);
LWLockAcquire(prewarm_lock, LW_EXCLUSIVE);
/* Do not prewarm more entries than LFC limit */
/* FIXME */
#if 0
if (prewarm_ctl->limit <= prewarm_ctl->size)
{
elog(LOG, "LFC: skip prewarm because LFC is already filled");
LWLockRelease(prewarm_lock);
return;
}
#endif
if (prewarm_ctl->prewarm_active)
{
LWLockRelease(prewarm_lock);
elog(ERROR, "LFC: skip prewarm because another prewarm is still active");
}
prewarm_ctl->n_prewarm_entries = n_entries;
prewarm_ctl->n_prewarm_workers = n_workers;
prewarm_ctl->prewarm_active = true;
prewarm_ctl->prewarm_canceled = false;
prewarm_ctl->prewarm_batch = prewarm_batch;
memset(prewarm_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState));
/* Calculate total number of pages to be prewarmed */
prewarm_ctl->total_prewarm_pages = fcs->n_pages;
LWLockRelease(prewarm_lock);
seg = dsm_create(fcs_size, 0);
memcpy(dsm_segment_address(seg), fcs, fcs_size);
prewarm_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg);
/* Spawn background workers */
for (uint32 i = 0; i < n_workers; i++)
{
BackgroundWorker worker = {0};
worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
worker.bgw_start_time = BgWorkerStart_ConsistentState;
worker.bgw_restart_time = BGW_NEVER_RESTART;
strcpy(worker.bgw_library_name, "neon");
strcpy(worker.bgw_function_name, "lfc_prewarm_main");
snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1);
strcpy(worker.bgw_type, "LFC prewarm worker");
worker.bgw_main_arg = Int32GetDatum(i);
/* must set notify PID to wait for shutdown */
worker.bgw_notify_pid = MyProcPid;
if (!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i]))
{
ereport(LOG,
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
errmsg("LFC: registering dynamic bgworker prewarm failed"),
errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes")));
n_workers = i;
prewarm_ctl->prewarm_canceled = true;
break;
}
}
for (uint32 i = 0; i < n_workers; i++)
{
bool interrupted;
do
{
interrupted = false;
PG_TRY();
{
BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]);
if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED)
{
elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status);
}
}
PG_CATCH();
{
elog(LOG, "LFC: cancel prewarm");
prewarm_ctl->prewarm_canceled = true;
interrupted = true;
}
PG_END_TRY();
} while (interrupted);
if (!prewarm_ctl->prewarm_workers[i].completed)
{
/* Background worker didn't set a completion time, which means it was terminated abnormally */
elog(LOG, "LFC: prewarm worker %d failed", i+1);
/* Set completion time to prevent get_prewarm_info from considering this worker as active */
prewarm_ctl->prewarm_workers[i].completed = GetCurrentTimestamp();
}
}
dsm_detach(seg);
LWLockAcquire(prewarm_lock, LW_EXCLUSIVE);
prewarm_ctl->prewarm_active = false;
LWLockRelease(prewarm_lock);
}
void
lfc_prewarm_main(Datum main_arg)
{
size_t snd_idx = 0, rcv_idx = 0;
size_t n_sent = 0, n_received = 0;
size_t fcs_chunk_size_log;
size_t max_prefetch_pages;
size_t prewarm_batch;
size_t n_workers;
dsm_segment *seg;
FileCacheState* fcs;
uint8* bitmap;
BufferTag tag;
PrewarmWorkerState* ws;
uint32 worker_id = DatumGetInt32(main_arg);
Assert(!neon_use_communicator_worker);
AmPrewarmWorker = true;
pqsignal(SIGTERM, die);
BackgroundWorkerUnblockSignals();
seg = dsm_attach(prewarm_ctl->prewarm_lfc_state_handle);
if (seg == NULL)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("could not map dynamic shared memory segment")));
fcs = (FileCacheState*) dsm_segment_address(seg);
prewarm_batch = prewarm_ctl->prewarm_batch;
fcs_chunk_size_log = fcs->chunk_size_log;
n_workers = prewarm_ctl->n_prewarm_workers;
max_prefetch_pages = prewarm_ctl->n_prewarm_entries << fcs_chunk_size_log;
ws = &prewarm_ctl->prewarm_workers[worker_id];
bitmap = FILE_CACHE_STATE_BITMAP(fcs);
/* enable prefetch in LFC */
lfc_store_prefetch_result = true;
lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existing entries if the LFC is full */
elog(LOG, "LFC: worker %d start prewarming", worker_id);
while (!prewarm_ctl->prewarm_canceled)
{
if (snd_idx < max_prefetch_pages)
{
if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id)
{
/* If there are multiple workers, split chunks between them */
snd_idx += 1 << fcs_chunk_size_log;
}
else
{
if (BITMAP_ISSET(bitmap, snd_idx))
{
tag = fcs->chunks[snd_idx >> fcs_chunk_size_log];
tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1);
if (!BufferTagIsValid(&tag))
elog(ERROR, "LFC: Invalid buffer tag: %u", tag.blockNum);
if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum))
{
(void) communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
n_sent += 1;
}
else
{
ws->skipped_pages += 1;
BITMAP_CLR(bitmap, snd_idx);
}
}
snd_idx += 1;
}
}
if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages)
{
if (n_received == n_sent && snd_idx == max_prefetch_pages)
{
break;
}
if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id)
{
/* Skip chunks processed by other workers */
rcv_idx += 1 << fcs_chunk_size_log;
continue;
}
/* Locate next block to prefetch */
while (!BITMAP_ISSET(bitmap, rcv_idx))
{
rcv_idx += 1;
}
tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log];
tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1);
if (communicator_prefetch_receive(tag))
{
ws->prewarmed_pages += 1;
}
else
{
ws->skipped_pages += 1;
}
rcv_idx += 1;
n_received += 1;
}
}
/* No need to perform prefetch cleanup here because the prewarm worker will be
 * terminated and the connection to the PS dropped just after returning from
 * this function.
 */
Assert(n_sent == n_received || prewarm_ctl->prewarm_canceled);
elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received);
prewarm_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
}
/*
* Prewarm LFC cache to the specified state. Uses the new communicator
*
* FIXME: Is there a race condition because we're not holding Postgres
* buffer manager locks?
*/
static void
lfc_prewarm_with_async_requests(FileCacheState *fcs)
{
size_t n_entries;
uint8 *bitmap;
uint64 bitno;
int blocks_per_chunk;
Assert(neon_use_communicator_worker);
if (lfc_prewarm_limit == 0)
{
elog(LOG, "LFC: prewarm is disabled");
return;
}
if (fcs == NULL || fcs->n_chunks == 0)
{
elog(LOG, "LFC: nothing to prewarm");
return;
}
n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
Assert(n_entries != 0);
PG_TRY();
{
LWLockAcquire(prewarm_lock, LW_EXCLUSIVE);
/* Do not prewarm more entries than LFC limit */
/* FIXME */
#if 0
if (prewarm_ctl->limit <= prewarm_ctl->size)
{
elog(LOG, "LFC: skip prewarm because LFC is already filled");
LWLockRelease(prewarm_lock);
return;
}
#endif
if (prewarm_ctl->prewarm_active)
{
LWLockRelease(prewarm_lock);
elog(ERROR, "LFC: skip prewarm because another prewarm is still active");
}
prewarm_ctl->n_prewarm_entries = n_entries;
prewarm_ctl->n_prewarm_workers = -1;
prewarm_ctl->prewarm_active = true;
prewarm_ctl->prewarm_canceled = false;
/* Calculate total number of pages to be prewarmed */
prewarm_ctl->total_prewarm_pages = fcs->n_pages;
LWLockRelease(prewarm_lock);
elog(LOG, "LFC: start prewarming");
lfc_do_prewarm = true;
lfc_prewarm_cancel = false;
bitmap = FILE_CACHE_STATE_BITMAP(fcs);
blocks_per_chunk = 1 << fcs->chunk_size_log;
bitno = 0;
for (uint32 chunkno = 0; chunkno < fcs->n_chunks; chunkno++)
{
BufferTag *chunk_tag = &fcs->chunks[chunkno];
BlockNumber request_startblkno = InvalidBlockNumber;
BlockNumber request_endblkno;
if (!BufferTagIsValid(chunk_tag))
elog(ERROR, "LFC: Invalid buffer tag: %u", chunk_tag->blockNum);
if (lfc_prewarm_cancel)
{
prewarm_ctl->prewarm_canceled = true;
break;
}
/* take next chunk */
for (int j = 0; j < blocks_per_chunk; j++)
{
BlockNumber blkno = chunk_tag->blockNum + j;
if (BITMAP_ISSET(bitmap, bitno))
{
if (request_startblkno != InvalidBlockNumber)
{
if (request_endblkno == blkno)
{
/* append this block to the request */
request_endblkno++;
}
else
{
/* flush this request, and start new one */
communicator_new_prefetch_register_bufferv(
BufTagGetNRelFileInfo(*chunk_tag),
chunk_tag->forkNum,
request_startblkno,
request_endblkno - request_startblkno
);
request_startblkno = blkno;
request_endblkno = blkno + 1;
}
}
else
{
/* flush this request, if any, and start new one */
if (request_startblkno != InvalidBlockNumber)
{
communicator_new_prefetch_register_bufferv(
BufTagGetNRelFileInfo(*chunk_tag),
chunk_tag->forkNum,
request_startblkno,
request_endblkno - request_startblkno
);
}
request_startblkno = blkno;
request_endblkno = blkno + 1;
}
prewarm_ctl->prewarmed_pages += 1;
}
bitno++;
}
/* flush this request */
communicator_new_prefetch_register_bufferv(
BufTagGetNRelFileInfo(*chunk_tag),
chunk_tag->forkNum,
request_startblkno,
request_endblkno - request_startblkno
);
request_startblkno = request_endblkno = InvalidBlockNumber;
}
elog(LOG, "LFC: complete prewarming: loaded %lu pages", (unsigned long) prewarm_ctl->prewarmed_pages);
prewarm_ctl->completed = GetCurrentTimestamp();
LWLockAcquire(prewarm_lock, LW_EXCLUSIVE);
prewarm_ctl->prewarm_active = false;
LWLockRelease(prewarm_lock);
}
PG_CATCH();
{
elog(LOG, "LFC: cancel prewarm");
prewarm_ctl->prewarm_canceled = true;
prewarm_ctl->prewarm_active = false;
}
PG_END_TRY();
}
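The inner loop above coalesces consecutive set bits of a chunk into a single prefetch request per contiguous run. A minimal sketch of the same coalescing idea, with a hypothetical emit() callback standing in for communicator_new_prefetch_register_bufferv() and the usual LSB-first bitmap convention assumed:
#include <stdbool.h>
#include <stdint.h>
typedef void (*emit_fn)(uint32_t startblk, uint32_t nblocks);
/* Illustrative only: walk a bitmap of nblocks pages starting at block
 * 'base' and emit one request per maximal run of consecutive set bits. */
static void
coalesce_runs(const uint8_t *bitmap, uint32_t base, uint32_t nblocks, emit_fn emit)
{
    uint32_t run_start = 0;
    bool in_run = false;
    for (uint32_t i = 0; i < nblocks; i++)
    {
        bool set = (bitmap[i / 8] >> (i % 8)) & 1;
        if (set && !in_run)
        {
            run_start = i;
            in_run = true;
        }
        else if (!set && in_run)
        {
            emit(base + run_start, i - run_start);
            in_run = false;
        }
    }
    if (in_run)
        emit(base + run_start, nblocks - run_start);
}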
PG_FUNCTION_INFO_V1(get_local_cache_state);
Datum
get_local_cache_state(PG_FUNCTION_ARGS)
{
size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
FileCacheState* fcs;
if (neon_use_communicator_worker)
fcs = communicator_new_get_lfc_state(max_entries);
else
fcs = lfc_get_state(max_entries);
if (fcs != NULL)
PG_RETURN_BYTEA_P((bytea*)fcs);
else
PG_RETURN_NULL();
}
PG_FUNCTION_INFO_V1(prewarm_local_cache);
Datum
prewarm_local_cache(PG_FUNCTION_ARGS)
{
bytea* state = PG_GETARG_BYTEA_PP(0);
uint32 n_workers = PG_GETARG_INT32(1);
FileCacheState* fcs;
fcs = (FileCacheState *)state;
validate_fcs(fcs);
if (neon_use_communicator_worker)
lfc_prewarm_with_async_requests(fcs);
else
lfc_prewarm_with_workers(fcs, n_workers);
PG_RETURN_NULL();
}
PG_FUNCTION_INFO_V1(get_prewarm_info);
Datum
get_prewarm_info(PG_FUNCTION_ARGS)
{
Datum values[4];
bool nulls[4];
TupleDesc tupdesc;
uint32 prewarmed_pages = 0;
uint32 skipped_pages = 0;
uint32 active_workers = 0;
uint32 total_pages;
if (lfc_size_limit == 0)
PG_RETURN_NULL();
LWLockAcquire(prewarm_lock, LW_SHARED);
if (!prewarm_ctl || prewarm_ctl->n_prewarm_workers == 0)
{
LWLockRelease(prewarm_lock);
PG_RETURN_NULL();
}
if (prewarm_ctl->n_prewarm_workers == -1)
{
total_pages = prewarm_ctl->total_prewarm_pages;
prewarmed_pages = prewarm_ctl->prewarmed_pages;
skipped_pages = prewarm_ctl->skipped_pages;
active_workers = 1;
}
else
{
size_t n_workers;
n_workers = prewarm_ctl->n_prewarm_workers;
total_pages = prewarm_ctl->total_prewarm_pages;
for (size_t i = 0; i < n_workers; i++)
{
PrewarmWorkerState *ws = &prewarm_ctl->prewarm_workers[i];
prewarmed_pages += ws->prewarmed_pages;
skipped_pages += ws->skipped_pages;
active_workers += ws->completed != 0;
}
}
LWLockRelease(prewarm_lock);
tupdesc = CreateTemplateTupleDesc(4);
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0);
tupdesc = BlessTupleDesc(tupdesc);
MemSet(nulls, 0, sizeof(nulls));
values[0] = Int32GetDatum(total_pages);
values[1] = Int32GetDatum(prewarmed_pages);
values[2] = Int32GetDatum(skipped_pages);
values[3] = Int32GetDatum(active_workers);
PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
}

View File

@@ -1,39 +0,0 @@
/*-------------------------------------------------------------------------
*
* lfc_prewarm.h
* Local File Cache prewarmer
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#ifndef LFC_PREWARM_H
#define LFC_PREWARM_H
#include "storage/buf_internals.h"
typedef struct FileCacheState
{
int32 vl_len_; /* varlena header (do not touch directly!) */
uint32 magic;
uint32 n_chunks;
uint32 n_pages;
uint16 chunk_size_log;
BufferTag chunks[FLEXIBLE_ARRAY_MEMBER];
/* followed by bitmap */
} FileCacheState;
#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc
#define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks])
#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks, blocks_per_chunk) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * blocks_per_chunk)+7)/8)
#define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8)
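As a worked example of the size macros above (illustrative numbers only; sizeof(FileCacheState) and sizeof(BufferTag) depend on the PostgreSQL build): 1000 chunks of 8 pages each (chunk_size_log = 3) need the struct header, 1000 BufferTags, and an 8000-bit bitmap rounded up to whole bytes:
#include <stdio.h>
/* Illustrative arithmetic only. */
int
main(void)
{
    unsigned n_chunks = 1000;
    unsigned chunk_size_log = 3;                  /* 8 pages per chunk */
    unsigned blocks_per_chunk = 1u << chunk_size_log;
    unsigned bitmap_bytes = (n_chunks * blocks_per_chunk + 7) / 8;
    printf("bitmap bytes: %u\n", bitmap_bytes);   /* prints 1000 */
    return 0;
}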
extern void pg_init_prewarm(void);
extern void PrewarmShmemRequest(void);
extern void PrewarmShmemInit(void);
#endif /* LFC_PREWARM_H */

View File

@@ -72,7 +72,6 @@ char *neon_branch_id;
char *neon_endpoint_id;
int32 max_cluster_size;
char *pageserver_connstring;
char *pageserver_grpc_urls;
char *neon_auth_token;
int readahead_buffer_size = 128;
@@ -82,7 +81,7 @@ int neon_protocol_version = 3;
static int neon_compute_mode = 0;
static int max_reconnect_attempts = 60;
int neon_stripe_size;
static int stripe_size;
static int max_sockets;
static int pageserver_response_log_timeout = 10000;
@@ -93,6 +92,13 @@ static int conf_refresh_reconnect_attempt_threshold = 16;
// Hadron: timeout for refresh errors (1 minute)
static uint64 kRefreshErrorTimeoutUSec = 1 * USECS_PER_MINUTE;
typedef struct
{
char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
size_t num_shards;
size_t stripe_size;
} ShardMap;
/*
* PagestoreShmemState is kept in shared memory. It contains the connection
* strings for each shard.
@@ -181,8 +187,6 @@ static void pageserver_disconnect_shard(shardno_t shard_no);
// HADRON
shardno_t get_num_shards(void);
static void AssignShardMap(const char *newval);
static bool
PagestoreShmemIsValid(void)
{
@@ -196,8 +200,8 @@ PagestoreShmemIsValid(void)
* not valid, returns false. The contents of *result are undefined in
* that case, and must not be relied on.
*/
bool
parse_shard_map(const char *connstr, ShardMap *result)
static bool
ParseShardMap(const char *connstr, ShardMap *result)
{
const char *p;
int nshards = 0;
@@ -242,31 +246,24 @@ parse_shard_map(const char *connstr, ShardMap *result)
if (result)
{
result->num_shards = nshards;
result->stripe_size = neon_stripe_size;
result->stripe_size = stripe_size;
}
return true;
}
/* GUC hooks for neon.pageserver_connstring */
static bool
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
{
char *p = *newval;
return parse_shard_map(p, NULL);
return ParseShardMap(p, NULL);
}
static void
AssignPageserverConnstring(const char *newval, void *extra)
{
/*
* 'neon.pageserver_connstring' is ignored if the new communicator is used.
* In that case, the shard map is loaded from 'neon.pageserver_grpc_urls'
* instead, and that happens in the communicator process only.
*/
if (neon_use_communicator_worker)
return;
ShardMap shard_map;
/*
* Only postmaster updates the copy in shared memory.
@@ -274,29 +271,11 @@ AssignPageserverConnstring(const char *newval, void *extra)
if (!PagestoreShmemIsValid() || IsUnderPostmaster)
return;
AssignShardMap(newval);
}
/* GUC hooks for neon.pageserver_connstring */
static bool
CheckPageserverGrpcUrls(char **newval, void **extra, GucSource source)
{
char *p = *newval;
return parse_shard_map(p, NULL);
}
static void
AssignShardMap(const char *newval)
{
ShardMap shard_map;
if (!parse_shard_map(newval, &shard_map))
if (!ParseShardMap(newval, &shard_map))
{
/*
* shouldn't happen, because we already checked the value in
* CheckPageserverConnstring/CheckPageserverGrpcUrls
* CheckPageserverConnstring
*/
elog(ERROR, "could not parse shard map");
}
@@ -315,27 +294,6 @@ AssignShardMap(const char *newval)
}
}
/*
* Set the 'num_shards' variable in shared memory.
*
* This is only used with the new communicator. The new communicator doesn't
* use the shard_map in shared memory, except for the shard count, which is
* needed by get_num_shards() calls in the walproposer. This is called to set
* that. This is only called from the communicator process, at process startup
* or if the configuration is reloaded.
*/
void
AssignNumShards(shardno_t num_shards)
{
Assert(neon_use_communicator_worker);
pg_atomic_add_fetch_u64(&pagestore_shared->begin_update_counter, 1);
pg_write_barrier();
pagestore_shared->shard_map.num_shards = num_shards;
pg_write_barrier();
pg_atomic_add_fetch_u64(&pagestore_shared->end_update_counter, 1);
}
/* BEGIN_HADRON */
/**
* Return the total number of shards seen in the shard map.
@@ -439,10 +397,10 @@ get_shard_number(BufferTag *tag)
#if PG_MAJORVERSION_NUM < 16
hash = murmurhash32(tag->rnode.relNode);
hash = hash_combine(hash, murmurhash32(tag->blockNum / neon_stripe_size));
hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
#else
hash = murmurhash32(tag->relNumber);
hash = hash_combine(hash, murmurhash32(tag->blockNum / neon_stripe_size));
hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
#endif
return hash % n_shards;
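The stripe mapping hashes the relation number together with the block's stripe index, so consecutive stripes of the same relation spread across shards while blocks within one stripe stay together. A rough standalone sketch of that scheme (the mixers below are stand-ins for illustration; PostgreSQL's murmurhash32() and hash_combine() are the real thing, and stripe_size and n_shards are assumed non-zero):
#include <stdint.h>
static uint32_t
mix32(uint32_t x)
{
    x ^= x >> 16; x *= 0x85ebca6b;
    x ^= x >> 13; x *= 0xc2b2ae35;
    x ^= x >> 16;
    return x;
}
static uint32_t
combine(uint32_t a, uint32_t b)
{
    return a ^ (b + 0x9e3779b9 + (a << 6) + (a >> 2));
}
/* Pick a shard for (relNumber, blockNum): all blocks in the same stripe
 * of the same relation land on the same shard. */
static uint32_t
shard_for_block(uint32_t rel_number, uint32_t block_num,
                uint32_t stripe_size, uint32_t n_shards)
{
    uint32_t h = mix32(rel_number);
    h = combine(h, mix32(block_num / stripe_size));
    return h % n_shards;
}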
@@ -1520,15 +1478,6 @@ pg_init_libpagestore(void)
0, /* no flags required */
CheckPageserverConnstring, AssignPageserverConnstring, NULL);
DefineCustomStringVariable("neon.pageserver_grpc_urls",
"list of gRPC URLs for the page servers",
NULL,
&pageserver_grpc_urls,
"",
PGC_SIGHUP,
0, /* no flags required */
CheckPageserverGrpcUrls, NULL, NULL);
DefineCustomStringVariable("neon.timeline_id",
"Neon timeline_id the server is running on",
NULL,
@@ -1575,7 +1524,7 @@ pg_init_libpagestore(void)
DefineCustomIntVariable("neon.stripe_size",
"sharding stripe size",
NULL,
&neon_stripe_size,
&stripe_size,
2048, 1, INT_MAX,
PGC_SIGHUP,
GUC_UNIT_BLOCKS,
@@ -1694,7 +1643,7 @@ pg_init_libpagestore(void)
if (neon_auth_token)
neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");
if (pageserver_connstring[0] || pageserver_grpc_urls[0])
if (pageserver_connstring[0])
{
neon_log(PageStoreTrace, "set neon_smgr hook");
smgr_hook = smgr_neon;

View File

@@ -21,7 +21,6 @@
#include "replication/logicallauncher.h"
#include "replication/slot.h"
#include "replication/walsender.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/ipc.h"
#include "funcapi.h"
@@ -32,7 +31,6 @@
#include "utils/guc_tables.h"
#include "communicator.h"
#include "communicator_new.h"
#include "communicator_process.h"
#include "extension_server.h"
#include "file_cache.h"
@@ -475,16 +473,6 @@ _PG_init(void)
load_file("$libdir/neon_rmgr", false);
#endif
DefineCustomBoolVariable(
"neon.use_communicator_worker",
"Uses the communicator worker implementation",
NULL,
&neon_use_communicator_worker,
true,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
if (lakebase_mode) {
prev_emit_log_hook = emit_log_hook;
emit_log_hook = DatabricksSqlErrorHookImpl;
@@ -524,14 +512,12 @@ _PG_init(void)
pg_init_libpagestore();
relsize_hash_init();
lfc_init();
pg_init_prewarm();
pg_init_walproposer();
pg_init_lwlsncache();
init_lwlsncache();
pg_init_communicator_process();
pg_init_communicator();
Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
InitUnstableExtensionsSupport();
@@ -737,10 +723,7 @@ approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
duration = PG_ARGISNULL(0) ? (time_t) -1 : PG_GETARG_INT32(0);
if (neon_use_communicator_worker)
dc = communicator_new_approximate_working_set_size_seconds(duration, false);
else
dc = lfc_approximate_working_set_size_seconds(duration, false);
dc = lfc_approximate_working_set_size_seconds(duration, false);
if (dc < 0)
PG_RETURN_NULL();
else
@@ -753,10 +736,7 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
bool reset = PG_GETARG_BOOL(0);
int32 dc;
if (neon_use_communicator_worker)
dc = communicator_new_approximate_working_set_size_seconds(-1, reset);
else
dc = lfc_approximate_working_set_size_seconds(-1, reset);
dc = lfc_approximate_working_set_size_seconds(-1, reset);
if (dc < 0)
PG_RETURN_NULL();
else
@@ -774,10 +754,7 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
InitMaterializedSRF(fcinfo, 0);
/* lfc_get_stats() does all the heavy lifting */
if (neon_use_communicator_worker)
entries = communicator_new_lfc_get_stats(&num_entries);
else
entries = lfc_get_stats(&num_entries);
entries = lfc_get_stats(&num_entries);
/* Convert the LfcStatsEntrys to a result set */
for (size_t i = 0; i < num_entries; i++)
@@ -851,13 +828,11 @@ neon_shmem_request_hook(void)
#endif
LfcShmemRequest();
PrewarmShmemRequest();
NeonPerfCountersShmemRequest();
PagestoreShmemRequest();
RelsizeCacheShmemRequest();
WalproposerShmemRequest();
LwLsnCacheShmemRequest();
CommunicatorNewShmemRequest();
}
@@ -875,7 +850,6 @@ neon_shmem_startup_hook(void)
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
LfcShmemInit();
PrewarmShmemInit();
NeonPerfCountersShmemInit();
if (lakebase_mode) {
DatabricksMetricsShmemInit();
@@ -884,7 +858,6 @@ neon_shmem_startup_hook(void)
RelsizeCacheShmemInit();
WalproposerShmemInit();
LwLsnCacheShmemInit();
CommunicatorNewShmemInit();
#if PG_MAJORVERSION_NUM >= 17
WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance");

View File

@@ -85,11 +85,5 @@ extern void WalproposerShmemInit(void);
extern void LwLsnCacheShmemInit(void);
extern void NeonPerfCountersShmemInit(void);
typedef struct LfcStatsEntry
{
const char *metric_name;
bool isnull;
uint64 value;
} LfcStatsEntry;
#endif /* NEON_H */

View File

@@ -85,54 +85,12 @@ static set_lwlsn_db_hook_type prev_set_lwlsn_db_hook = NULL;
static void neon_set_max_lwlsn(XLogRecPtr lsn);
void
pg_init_lwlsncache(void)
init_lwlsncache(void)
{
if (!process_shared_preload_libraries_in_progress)
ereport(ERROR, errcode(ERRCODE_INTERNAL_ERROR), errmsg("Loading of shared preload libraries is not in progress. Exiting"));
lwlc_register_gucs();
}
void
LwLsnCacheShmemRequest(void)
{
Size requested_size;
if (neon_use_communicator_worker)
return;
requested_size = sizeof(LwLsnCacheCtl);
requested_size += hash_estimate_size(lwlsn_cache_size, sizeof(LastWrittenLsnCacheEntry));
RequestAddinShmemSpace(requested_size);
}
void
LwLsnCacheShmemInit(void)
{
static HASHCTL info;
bool found;
if (neon_use_communicator_worker)
return;
Assert(lwlsn_cache_size > 0);
info.keysize = sizeof(BufferTag);
info.entrysize = sizeof(LastWrittenLsnCacheEntry);
lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache",
lwlsn_cache_size, lwlsn_cache_size,
&info,
HASH_ELEM | HASH_BLOBS);
LwLsnCache = ShmemInitStruct("neon/LwLsnCacheCtl", sizeof(LwLsnCacheCtl), &found);
// Now set the size in the struct
LwLsnCache->lastWrittenLsnCacheSize = lwlsn_cache_size;
if (found) {
return;
}
dlist_init(&LwLsnCache->lastWrittenLsnLRU);
LwLsnCache->maxLastWrittenLsn = GetRedoRecPtr();
prev_set_lwlsn_block_range_hook = set_lwlsn_block_range_hook;
set_lwlsn_block_range_hook = neon_set_lwlsn_block_range;
@@ -148,6 +106,41 @@ LwLsnCacheShmemInit(void)
set_lwlsn_db_hook = neon_set_lwlsn_db;
}
void
LwLsnCacheShmemRequest(void)
{
Size requested_size = sizeof(LwLsnCacheCtl);
requested_size += hash_estimate_size(lwlsn_cache_size, sizeof(LastWrittenLsnCacheEntry));
RequestAddinShmemSpace(requested_size);
}
void
LwLsnCacheShmemInit(void)
{
static HASHCTL info;
bool found;
if (lwlsn_cache_size > 0)
{
info.keysize = sizeof(BufferTag);
info.entrysize = sizeof(LastWrittenLsnCacheEntry);
lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache",
lwlsn_cache_size, lwlsn_cache_size,
&info,
HASH_ELEM | HASH_BLOBS);
LwLsnCache = ShmemInitStruct("neon/LwLsnCacheCtl", sizeof(LwLsnCacheCtl), &found);
// Now set the size in the struct
LwLsnCache->lastWrittenLsnCacheSize = lwlsn_cache_size;
if (found) {
return;
}
}
dlist_init(&LwLsnCache->lastWrittenLsnLRU);
LwLsnCache->maxLastWrittenLsn = GetRedoRecPtr();
}
/*
* neon_get_lwlsn -- Returns maximal LSN of written page.
* It returns an upper bound for the last written LSN of a given page,
@@ -162,7 +155,6 @@ neon_get_lwlsn(NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno)
XLogRecPtr lsn;
LastWrittenLsnCacheEntry* entry;
Assert(!neon_use_communicator_worker);
Assert(LwLsnCache->lastWrittenLsnCacheSize != 0);
LWLockAcquire(LastWrittenLsnLock, LW_SHARED);
@@ -215,10 +207,7 @@ neon_get_lwlsn(NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno)
return lsn;
}
static void
neon_set_max_lwlsn(XLogRecPtr lsn)
{
Assert(!neon_use_communicator_worker);
static void neon_set_max_lwlsn(XLogRecPtr lsn) {
LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);
LwLsnCache->maxLastWrittenLsn = lsn;
LWLockRelease(LastWrittenLsnLock);
@@ -239,7 +228,6 @@ neon_get_lwlsn_v(NRelFileInfo relfilenode, ForkNumber forknum,
LastWrittenLsnCacheEntry* entry;
XLogRecPtr lsn;
Assert(!neon_use_communicator_worker);
Assert(LwLsnCache->lastWrittenLsnCacheSize != 0);
Assert(nblocks > 0);
Assert(PointerIsValid(lsns));
@@ -388,8 +376,6 @@ SetLastWrittenLSNForBlockRangeInternal(XLogRecPtr lsn,
XLogRecPtr
neon_set_lwlsn_block_range(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks)
{
Assert(!neon_use_communicator_worker);
if (lsn == InvalidXLogRecPtr || n_blocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0)
return lsn;
@@ -426,8 +412,6 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode,
Oid dbOid = NInfoGetDbOid(relfilenode);
Oid relNumber = NInfoGetRelNumber(relfilenode);
Assert(!neon_use_communicator_worker);
if (lsns == NULL || nblocks == 0 || LwLsnCache->lastWrittenLsnCacheSize == 0 ||
NInfoGetRelNumber(relfilenode) == InvalidOid)
return InvalidXLogRecPtr;
@@ -485,7 +469,6 @@ neon_set_lwlsn_block_v(const XLogRecPtr *lsns, NRelFileInfo relfilenode,
XLogRecPtr
neon_set_lwlsn_block(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno)
{
Assert(!neon_use_communicator_worker);
return neon_set_lwlsn_block_range(lsn, rlocator, forknum, blkno, 1);
}
@@ -495,7 +478,6 @@ neon_set_lwlsn_block(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum,
XLogRecPtr
neon_set_lwlsn_relation(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum)
{
Assert(!neon_use_communicator_worker);
return neon_set_lwlsn_block(lsn, rlocator, forknum, REL_METADATA_PSEUDO_BLOCKNO);
}
@@ -506,8 +488,6 @@ XLogRecPtr
neon_set_lwlsn_db(XLogRecPtr lsn)
{
NRelFileInfo dummyNode = {InvalidOid, InvalidOid, InvalidOid};
Assert(!neon_use_communicator_worker);
return neon_set_lwlsn_block(lsn, dummyNode, MAIN_FORKNUM, 0);
}

View File

@@ -3,7 +3,7 @@
#include "neon_pgversioncompat.h"
extern void pg_init_lwlsncache(void);
void init_lwlsncache(void);
/* Hooks */
XLogRecPtr neon_get_lwlsn(NRelFileInfo rlocator, ForkNumber forknum, BlockNumber blkno);
@@ -14,4 +14,4 @@ XLogRecPtr neon_set_lwlsn_block(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumbe
XLogRecPtr neon_set_lwlsn_relation(XLogRecPtr lsn, NRelFileInfo rlocator, ForkNumber forknum);
XLogRecPtr neon_set_lwlsn_db(XLogRecPtr lsn);
#endif /* NEON_LWLSNCACHE_H */
#endif /* NEON_LWLSNCACHE_H */

View File

@@ -237,27 +237,15 @@ extern void prefetch_on_ps_disconnect(void);
extern page_server_api *page_server;
extern char *pageserver_connstring;
extern char *pageserver_grpc_urls;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern char *neon_timeline;
extern char *neon_tenant;
extern int32 max_cluster_size;
extern int neon_protocol_version;
extern int neon_stripe_size;
typedef struct
{
char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
size_t num_shards;
size_t stripe_size;
} ShardMap;
extern bool parse_shard_map(const char *connstr, ShardMap *result);
extern shardno_t get_shard_number(BufferTag* tag);
extern void AssignNumShards(shardno_t num_shards);
extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
extern void smgr_init_neon(void);
extern void readahead_buffer_resize(int newsize, void *extra);
@@ -302,7 +290,6 @@ extern int64 neon_dbsize(Oid dbNode);
extern void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
BlockNumber blkno, neon_request_lsns *output,
BlockNumber nblocks);
extern XLogRecPtr neon_get_write_lsn(void);
/* utils for neon relsize cache */
extern void relsize_hash_init(void);

View File

@@ -62,7 +62,6 @@
#include "bitmap.h"
#include "communicator.h"
#include "communicator_new.h"
#include "file_cache.h"
#include "neon.h"
#include "neon_lwlsncache.h"
@@ -302,7 +301,7 @@ neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
*/
lsns[batch_size++] = lsn;
if (batch_size >= BLOCK_BATCH_SIZE && !neon_use_communicator_worker)
if (batch_size >= BLOCK_BATCH_SIZE)
{
neon_set_lwlsn_block_v(lsns, InfoFromSMgrRel(reln), forknum,
batch_blockno,
@@ -312,7 +311,7 @@ neon_wallog_pagev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
}
}
if (batch_size != 0 && !neon_use_communicator_worker)
if (batch_size != 0)
{
neon_set_lwlsn_block_v(lsns, InfoFromSMgrRel(reln), forknum,
batch_blockno,
@@ -437,17 +436,11 @@ neon_wallog_page(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, co
forknum, LSN_FORMAT_ARGS(lsn))));
}
if (!neon_use_communicator_worker)
{
/*
* Remember the LSN on this page. When we read the page again, we must
* read the same or newer version of it.
*
* (With the new communicator, the caller will make a write-request
* for this page, which updates the last-written LSN too)
*/
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forknum, blocknum);
}
/*
* Remember the LSN on this page. When we read the page again, we must
* read the same or newer version of it.
*/
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forknum, blocknum);
}
/*
@@ -504,60 +497,6 @@ nm_adjust_lsn(XLogRecPtr lsn)
return lsn;
}
/*
 * Get an LSN to use to stamp an operation like relation create or truncate.
* On operations on individual pages we use the LSN of the page, but when
* e.g. smgrcreate() is called, we have to do something else.
*/
XLogRecPtr
neon_get_write_lsn(void)
{
XLogRecPtr lsn;
if (RecoveryInProgress())
{
/*
* FIXME: v14 doesn't have GetCurrentReplayRecPtr(). Options:
* - add it in our fork
* - store a magic value that means that you must use
* current latest possible LSN at the time that the request
* on this thing is made again (or some other recent enough
* lsn).
*/
#if PG_VERSION_NUM >= 150000
lsn = GetCurrentReplayRecPtr(NULL);
#else
lsn = GetXLogReplayRecPtr(NULL); /* FIXME: this is wrong, see above */
#endif
}
else
lsn = GetXLogInsertRecPtr();
/*
* If the insert LSN points to just after page header, round it down to
* the beginning of the page, because the page header might not have been
* inserted to the WAL yet, and if we tried to flush it, the WAL flushing
* code gets upset.
*/
{
int segoff;
segoff = XLogSegmentOffset(lsn, wal_segment_size);
if (segoff == SizeOfXLogLongPHD)
{
lsn = lsn - segoff;
}
else
{
int offset = lsn % XLOG_BLCKSZ;
if (offset == SizeOfXLogShortPHD)
lsn = lsn - offset;
}
}
return lsn;
}
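The rounding at the end of the removed function above can be expressed compactly; a sketch under assumed constants (the real values come from wal_segment_size, SizeOfXLogLongPHD, SizeOfXLogShortPHD and XLOG_BLCKSZ):
#include <stdint.h>
/* Illustrative sketch: if the insert LSN points just past a WAL page
 * header, round it down to the page (or segment) boundary, mirroring the
 * logic in the removed neon_get_write_lsn(). All constants are assumed. */
static uint64_t
round_lsn_past_page_header(uint64_t lsn, uint64_t segment_size,
                           uint64_t long_hdr_size, uint64_t short_hdr_size,
                           uint64_t block_size)
{
    uint64_t segoff = lsn % segment_size;
    if (segoff == long_hdr_size)
        return lsn - segoff;                /* first page of a segment */
    if (lsn % block_size == short_hdr_size)
        return lsn - (lsn % block_size);    /* any other page boundary */
    return lsn;
}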
/*
* Return LSN for requesting pages and number of blocks from page server
@@ -570,7 +509,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
{
XLogRecPtr last_written_lsns[PG_IOV_MAX];
Assert(!neon_use_communicator_worker);
Assert(nblocks <= PG_IOV_MAX);
neon_get_lwlsn_v(rinfo, forknum, blkno, (int) nblocks, last_written_lsns);
@@ -802,6 +740,11 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
{
return true;
}
/*
* \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server
* will error out if you check that, because the whole dbdir for
@@ -825,20 +768,10 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
return false;
}
if (neon_use_communicator_worker)
return communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum);
else
{
if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
{
return true;
}
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
}
return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
}
/*
@@ -896,53 +829,16 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
* relation. Currently, we don't call SetLastWrittenLSN() when a new
* relation created, so if we didn't remember the size in the relsize
* cache, we might call smgrnblocks() on the newly-created relation before
* the creation WAL record has been received by the page server.
*
* XXX: with the new communicator, similar considerations apply. However,
* during replay, neon_get_write_lsn() returns the (end-)LSN of the record
* that's being replayed, so we should not have the correctness issue
* mentioned in previous paragraph.
 * the creation WAL record has been received by the page server.
*/
if (neon_use_communicator_worker)
if (isRedo)
{
XLogRecPtr lsn = neon_get_write_lsn();
if (isRedo)
{
/*
* TODO: the protocol can check for existence and get the relsize
* in one roundtrip. Add a similar call to the
* backend<->communicator API. (The size is cached on the
* rel_exists call, so this does only one roundtrip to the
* pageserver, but two function calls and two cache lookups.)
*/
if (!communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum))
{
communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum, lsn);
reln->smgr_cached_nblocks[forkNum] = 0;
}
else
{
BlockNumber nblocks;
nblocks = communicator_new_rel_nblocks(InfoFromSMgrRel(reln), forkNum);
reln->smgr_cached_nblocks[forkNum] = nblocks;
}
}
else
communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum, lsn);
update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
&reln->smgr_cached_nblocks[forkNum]);
}
else
{
if (isRedo)
{
update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
&reln->smgr_cached_nblocks[forkNum]);
}
else
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
}
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
if (debug_compare_local)
{
@@ -978,17 +874,9 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
* unlink, it won't do any harm if the file doesn't exist.
*/
mdunlink(rinfo, forkNum, isRedo);
if (!NRelFileInfoBackendIsTemp(rinfo))
{
if (neon_use_communicator_worker)
{
XLogRecPtr lsn = neon_get_write_lsn();
communicator_new_rel_unlink(InfoFromNInfoB(rinfo), forkNum, lsn);
}
else
forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
}
}
@@ -1011,7 +899,6 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
#endif
{
XLogRecPtr lsn;
bool lsn_was_zero;
BlockNumber n_blocks = 0;
switch (reln->smgr_relpersistence)
@@ -1069,6 +956,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
neon_wallog_page(reln, forkNum, blkno, buffer, false);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
lsn = PageGetLSN((Page) buffer);
neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
@@ -1076,6 +964,14 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
forkNum, blkno,
(uint32) (lsn >> 32), (uint32) lsn);
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
}
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
@@ -1083,51 +979,20 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
* it is eventually evicted from the buffer cache. But we need a valid LSN
 * for the relation metadata update now.
*/
lsn_was_zero = (lsn == InvalidXLogRecPtr);
if (lsn_was_zero)
if (lsn == InvalidXLogRecPtr)
{
lsn = GetXLogInsertRecPtr();
if (neon_use_communicator_worker)
{
communicator_new_rel_extend(InfoFromSMgrRel(reln), forkNum, blkno, (const void *) buffer, lsn);
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
}
}
else
{
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
}
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
* later, after it has been initialized with the real page contents, and
* it is eventually evicted from the buffer cache. But we need a valid LSN
 * for the relation metadata update now.
*/
if (lsn_was_zero)
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
}
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
}
#if PG_MAJORVERSION_NUM >= 16
static void
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
int nblocks, bool skipFsync)
{
const PGIOAlignedBlock buffer = {0};
BlockNumber blocknum = start_block;
int remblocks = nblocks;
XLogRecPtr lsn = 0;
@@ -1210,14 +1075,11 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);
if (!neon_use_communicator_worker)
for (int i = 0; i < count; i++)
{
for (int i = 0; i < count; i++)
{
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum,
blocknum + i);
}
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum,
blocknum + i);
}
blocknum += count;
@@ -1226,15 +1088,8 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
Assert(lsn != 0);
if (neon_use_communicator_worker)
{
communicator_new_rel_zeroextend(InfoFromSMgrRel(reln), forkNum, start_block, nblocks, lsn);
}
else
{
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
}
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
}
#endif
@@ -1294,12 +1149,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_use_communicator_worker)
{
communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, nblocks);
return false;
}
tag.spcOid = reln->smgr_rlocator.locator.spcOid;
tag.dbOid = reln->smgr_rlocator.locator.dbOid;
tag.relNumber = reln->smgr_rlocator.locator.relNumber;
@@ -1326,8 +1175,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
blocknum += iterblocks;
}
if (!neon_use_communicator_worker)
communicator_prefetch_pump_state();
communicator_prefetch_pump_state();
return false;
}
@@ -1340,6 +1188,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
static bool
neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
BufferTag tag;
switch (reln->smgr_relpersistence)
{
case 0: /* probably shouldn't happen, but ignore it */
@@ -1354,25 +1204,17 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_use_communicator_worker)
{
communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, 1);
}
else
{
BufferTag tag;
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
return false;
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
return false;
tag.forkNum = forknum;
tag.blockNum = blocknum;
tag.forkNum = forknum;
tag.blockNum = blocknum;
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
communicator_prefetch_pump_state();
}
communicator_prefetch_pump_state();
return false;
}
@@ -1416,8 +1258,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
*/
neon_log(SmgrTrace, "writeback noop");
if (!neon_use_communicator_worker)
communicator_prefetch_pump_state();
communicator_prefetch_pump_state();
if (debug_compare_local)
{
@@ -1434,14 +1275,7 @@ void
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
neon_request_lsns request_lsns, void *buffer)
{
if (neon_use_communicator_worker)
{
// FIXME: request_lsns is ignored. That affects the neon_test_utils callers.
// Add the capability to specify the LSNs explicitly, for the sake of neon_test_utils ?
communicator_new_read_at_lsn_uncached(rinfo, forkNum, blkno, buffer, request_lsns.request_lsn, request_lsns.not_modified_since);
}
else
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
}
static void
@@ -1567,55 +1401,47 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_use_communicator_worker)
/* Try to read PS results if they are available */
communicator_prefetch_pump_state();
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
present = 0;
bufferp = buffer;
if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
{
communicator_new_readv(InfoFromSMgrRel(reln), forkNum, blkno,
(void *) &buffer, 1);
/* Prefetch hit */
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH)
{
return;
}
}
else
/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
{
/* Try to read PS results if they are available */
communicator_prefetch_pump_state();
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
present = 0;
bufferp = buffer;
if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
MyNeonCounters->file_cache_hits_total++;
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
{
/* Prefetch hit */
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH)
{
return;
}
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC)
{
MyNeonCounters->file_cache_hits_total++;
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC)
{
return;
}
return;
}
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state();
}
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state();
if (debug_compare_local)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
@@ -1678,67 +1504,59 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
nblocks, PG_IOV_MAX);
/* Try to read PS results if they are available */
if (!neon_use_communicator_worker)
communicator_prefetch_pump_state();
communicator_prefetch_pump_state();
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
request_lsns, nblocks);
memset(read_pages, 0, sizeof(read_pages));
if (neon_use_communicator_worker)
prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
blocknum, request_lsns, nblocks,
buffers, read_pages);
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
{
communicator_new_readv(InfoFromSMgrRel(reln), forknum, blocknum,
buffers, nblocks);
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
else
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH && prefetch_result == nblocks)
{
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
request_lsns, nblocks);
prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
blocknum, request_lsns, nblocks,
buffers, read_pages);
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
{
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH && prefetch_result == nblocks)
{
return;
}
if (debug_compare_local > DEBUG_COMPARE_LOCAL_PREFETCH)
{
memset(read_pages, 0, sizeof(read_pages));
}
/* Try to read from local file cache */
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
nblocks, read_pages);
if (lfc_result > 0)
MyNeonCounters->file_cache_hits_total += lfc_result;
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
{
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC && prefetch_result + lfc_result == nblocks)
{
/* Read all blocks from LFC, so we're done */
return;
}
if (debug_compare_local > DEBUG_COMPARE_LOCAL_LFC)
{
memset(read_pages, 0, sizeof(read_pages));
}
communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
buffers, nblocks, read_pages);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state();
return;
}
if (debug_compare_local > DEBUG_COMPARE_LOCAL_PREFETCH)
{
memset(read_pages, 0, sizeof(read_pages));
}
/* Try to read from local file cache */
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
nblocks, read_pages);
if (lfc_result > 0)
MyNeonCounters->file_cache_hits_total += lfc_result;
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
{
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC && prefetch_result + lfc_result == nblocks)
{
/* Read all blocks from LFC, so we're done */
return;
}
if (debug_compare_local > DEBUG_COMPARE_LOCAL_LFC)
{
memset(read_pages, 0, sizeof(read_pages));
}
communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
buffers, nblocks, read_pages);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state();
if (debug_compare_local)
{
@@ -1839,16 +1657,9 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
forknum, blocknum,
(uint32) (lsn >> 32), (uint32) lsn);
if (neon_use_communicator_worker)
{
communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blocknum, buffer, lsn);
}
else
{
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
communicator_prefetch_pump_state();
}
communicator_prefetch_pump_state();
if (debug_compare_local)
{
@@ -1909,21 +1720,9 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
if (neon_use_communicator_worker)
{
for (int i = 0; i < nblocks; i++)
{
XLogRecPtr lsn = PageGetLSN((Page) buffers[i]);
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blkno + i, buffers[i], lsn);
}
}
else
{
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
communicator_prefetch_pump_state();
}
communicator_prefetch_pump_state();
if (debug_compare_local)
{
@@ -1964,26 +1763,19 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_use_communicator_worker)
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
{
n_blocks = communicator_new_rel_nblocks(InfoFromSMgrRel(reln), forknum);
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
}
else
{
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
{
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
}
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
}
n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
@@ -2004,17 +1796,10 @@ neon_dbsize(Oid dbNode)
neon_request_lsns request_lsns;
NRelFileInfo dummy_node = {0};
if (neon_use_communicator_worker)
{
db_size = communicator_new_dbsize(dbNode);
}
else
{
neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
db_size = communicator_dbsize(dbNode, &request_lsns);
}
db_size = communicator_dbsize(dbNode, &request_lsns);
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
@@ -2028,6 +1813,8 @@ neon_dbsize(Oid dbNode)
static void
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
{
XLogRecPtr lsn;
switch (reln->smgr_relpersistence)
{
case 0:
@@ -2051,45 +1838,34 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_use_communicator_worker)
{
XLogRecPtr lsn = neon_get_write_lsn();
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
communicator_new_rel_truncate(InfoFromSMgrRel(reln), forknum, nblocks, lsn);
}
else
{
XLogRecPtr lsn;
/*
* Truncating a relation drops all its buffers from the buffer cache
* without calling smgrwrite() on them. But we must account for that in
* our tracking of last-written-LSN all the same: any future smgrnblocks()
* request must return the new size after the truncation. We don't know
* what the LSN of the truncation record was, so be conservative and use
* the most recently inserted WAL record's LSN.
*/
lsn = GetXLogInsertRecPtr();
lsn = nm_adjust_lsn(lsn);
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
/*
* Flush it, too. We don't actually care about it here, but let's uphold
* the invariant that last-written LSN <= flush LSN.
*/
XLogFlush(lsn);
/*
* Truncating a relation drops all its buffers from the buffer cache
* without calling smgrwrite() on them. But we must account for that in
* our tracking of last-written-LSN all the same: any future smgrnblocks()
* request must return the new size after the truncation. We don't know
* what the LSN of the truncation record was, so be conservative and use
* the most recently inserted WAL record's LSN.
*/
lsn = GetXLogInsertRecPtr();
lsn = nm_adjust_lsn(lsn);
/*
* Flush it, too. We don't actually care about it here, but let's uphold
* the invariant that last-written LSN <= flush LSN.
*/
XLogFlush(lsn);
/*
* Truncate may affect several chunks of relations. So we should either
* update last written LSN for all of them, or update LSN for "dummy"
* metadata block. Second approach seems more efficient. If the relation
* is extended again later, the extension will update the last-written LSN
* for the extended pages, so there's no harm in leaving behind obsolete
* entries for the truncated chunks.
*/
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
}
/*
* Truncate may affect several chunks of relations. So we should either
* update last written LSN for all of them, or update LSN for "dummy"
* metadata block. Second approach seems more efficient. If the relation
* is extended again later, the extension will update the last-written LSN
* for the extended pages, so there's no harm in leaving behind obsolete
* entries for the truncated chunks.
*/
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
if (debug_compare_local)
{
@@ -2132,8 +1908,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
if (!neon_use_communicator_worker)
communicator_prefetch_pump_state();
communicator_prefetch_pump_state();
if (debug_compare_local)
{
@@ -2319,15 +2094,12 @@ neon_end_unlogged_build(SMgrRelation reln)
nblocks = mdnblocks(reln, MAIN_FORKNUM);
recptr = GetXLogInsertRecPtr();
if (!neon_use_communicator_worker)
{
neon_set_lwlsn_block_range(recptr,
InfoFromNInfoB(rinfob),
MAIN_FORKNUM, 0, nblocks);
neon_set_lwlsn_relation(recptr,
InfoFromNInfoB(rinfob),
MAIN_FORKNUM);
}
neon_set_lwlsn_block_range(recptr,
InfoFromNInfoB(rinfob),
MAIN_FORKNUM, 0, nblocks);
neon_set_lwlsn_relation(recptr,
InfoFromNInfoB(rinfob),
MAIN_FORKNUM);
/* Remove local copy */
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
@@ -2336,15 +2108,8 @@ neon_end_unlogged_build(SMgrRelation reln)
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
forknum);
if (neon_use_communicator_worker)
{
communicator_new_update_cached_rel_size(InfoFromSMgrRel(reln), forknum, nblocks, recptr);
}
else
{
forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
}
forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
mdclose(reln, forknum);
if (!debug_compare_local)
@@ -2412,10 +2177,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
request_lsns.not_modified_since = not_modified_since;
request_lsns.effective_request_lsn = request_lsn;
if (neon_use_communicator_worker)
n_blocks = communicator_new_read_slru_segment(kind, (uint32_t)segno, &request_lsns, path);
else
n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
return n_blocks;
}
@@ -2452,8 +2214,7 @@ AtEOXact_neon(XactEvent event, void *arg)
}
break;
}
if (!neon_use_communicator_worker)
communicator_reconfigure_timeout_if_needed();
communicator_reconfigure_timeout_if_needed();
}
static const struct f_smgr neon_smgr =
@@ -2511,10 +2272,7 @@ smgr_init_neon(void)
smgr_init_standard();
neon_init();
if (neon_use_communicator_worker)
communicator_new_init();
else
communicator_init();
communicator_init();
}
@@ -2526,16 +2284,6 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
/* This is only used in WAL replay */
Assert(RecoveryInProgress());
if (neon_use_communicator_worker)
{
relsize = communicator_new_rel_nblocks(rinfo, forknum);
if (blkno >= relsize)
communicator_new_rel_zeroextend(rinfo, forknum, relsize, (blkno - relsize) + 1, end_recptr);
return;
}
/* Extend the relation if we know its size */
if (get_cached_relsize(rinfo, forknum, &relsize))
{
@@ -2690,27 +2438,18 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
}
/*
* We don't have the buffer in shared buffers. Check if it's in the LFC.
* If it's not there either, update the lwLsn past this record.
* we don't have the buffer in memory, update lwLsn past this record, also
* evict page from file cache
*/
if (no_redo_needed)
{
bool in_cache;
neon_set_lwlsn_block(end_recptr, rinfo, forknum, blkno);
/*
* Redo changes if the page is present in the LFC.
* Redo changes if page exists in LFC.
* We should perform this check after assigning LwLSN to prevent
* prefetching of some older version of the page by some other backend.
*/
if (neon_use_communicator_worker)
{
in_cache = communicator_new_update_lwlsn_for_block_if_not_cached(rinfo, forknum, blkno, end_recptr);
}
else
{
in_cache = lfc_cache_contains(rinfo, forknum, blkno);
neon_set_lwlsn_block(end_recptr, rinfo, forknum, blkno);
}
no_redo_needed = !in_cache;
no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno);
}
LWLockRelease(partitionLock);
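For orientation, the vectored read path changed in the first hunks of this file keeps a three-tier order: consume prefetch responses that have already arrived, then probe the local file cache, and only fetch the still-missing blocks from the pageserver at the computed request LSNs. A minimal sketch of that order (Rust for brevity; every name below is a placeholder, not the extension's API):

// Placeholder sketch of the tiered read order; the real code tracks per-block hits in read_pages[].
fn read_blocks(nblocks: usize) -> Vec<bool> {
    let mut read = vec![false; nblocks];
    take_prefetch_results(&mut read);      // analogous to communicator_prefetch_lookupv
    read_from_local_file_cache(&mut read); // analogous to lfc_readv_select
    if read.iter().any(|&r| !r) {
        read_from_pageserver(&mut read);   // analogous to communicator_read_at_lsnv
    }
    read
}

// Stub tiers so the sketch compiles; each marks the blocks it "served".
fn take_prefetch_results(read: &mut [bool]) {
    if let Some(first) = read.first_mut() {
        *first = true;
    }
}
fn read_from_local_file_cache(_read: &mut [bool]) {}
fn read_from_pageserver(read: &mut [bool]) {
    for r in read.iter_mut() {
        *r = true;
    }
}

fn main() {
    assert!(read_blocks(4).into_iter().all(|r| r));
}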

View File

@@ -87,8 +87,6 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
{
bool found = false;
Assert(!neon_use_communicator_worker);
if (relsize_hash_size > 0)
{
RelTag tag;
@@ -120,8 +118,6 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
void
set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
{
Assert(!neon_use_communicator_worker);
if (relsize_hash_size > 0)
{
RelTag tag;
@@ -170,8 +166,6 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
void
update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
{
Assert(!neon_use_communicator_worker);
if (relsize_hash_size > 0)
{
RelTag tag;
@@ -206,8 +200,6 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
void
forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
{
Assert(!neon_use_communicator_worker);
if (relsize_hash_size > 0)
{
RelTag tag;

View File

@@ -5,12 +5,17 @@ use std::sync::Arc;
use bytes::Bytes;
use http::Method;
use http::header::{AUTHORIZATION, CONTENT_TYPE, HOST};
use http::header::{
ACCESS_CONTROL_ALLOW_HEADERS, ACCESS_CONTROL_ALLOW_METHODS, ACCESS_CONTROL_ALLOW_ORIGIN,
ACCESS_CONTROL_EXPOSE_HEADERS, ACCESS_CONTROL_MAX_AGE, ACCESS_CONTROL_REQUEST_HEADERS, ALLOW,
AUTHORIZATION, CONTENT_TYPE, HOST, ORIGIN,
};
use http_body_util::combinators::BoxBody;
use http_body_util::{BodyExt, Full};
use http_body_util::{BodyExt, Empty, Full};
use http_utils::error::ApiError;
use hyper::body::Incoming;
use hyper::http::{HeaderName, HeaderValue};
use hyper::http::response::Builder;
use hyper::http::{HeaderMap, HeaderName, HeaderValue};
use hyper::{Request, Response, StatusCode};
use indexmap::IndexMap;
use moka::sync::Cache;
@@ -67,6 +72,15 @@ use crate::util::deserialize_json_string;
static EMPTY_JSON_SCHEMA: &str = r#"{"schemas":[]}"#;
const INTROSPECTION_SQL: &str = POSTGRESQL_INTROSPECTION_SQL;
const HEADER_VALUE_ALLOW_ALL_ORIGINS: HeaderValue = HeaderValue::from_static("*");
// CORS header values
const ACCESS_CONTROL_ALLOW_METHODS_VALUE: HeaderValue =
HeaderValue::from_static("GET, POST, PATCH, PUT, DELETE, OPTIONS");
const ACCESS_CONTROL_MAX_AGE_VALUE: HeaderValue = HeaderValue::from_static("86400");
const ACCESS_CONTROL_EXPOSE_HEADERS_VALUE: HeaderValue = HeaderValue::from_static(
"Content-Encoding, Content-Location, Content-Range, Content-Type, Date, Location, Server, Transfer-Encoding, Range-Unit",
);
const ACCESS_CONTROL_ALLOW_HEADERS_VALUE: HeaderValue = HeaderValue::from_static("Authorization");
// A wrapper around the DbSchema that allows for self-referencing
#[self_referencing]
@@ -137,6 +151,8 @@ pub struct ApiConfig {
pub role_claim_key: String,
#[serde(default, deserialize_with = "deserialize_comma_separated_option")]
pub db_extra_search_path: Option<Vec<String>>,
#[serde(default, deserialize_with = "deserialize_comma_separated_option")]
pub server_cors_allowed_origins: Option<Vec<String>>,
}
// The DbSchemaCache is a cache of the ApiConfig and DbSchemaOwned for each endpoint
@@ -165,7 +181,13 @@ impl DbSchemaCache {
}
}
pub async fn get_cached_or_remote(
pub fn get_cached(
&self,
endpoint_id: &EndpointCacheKey,
) -> Option<Arc<(ApiConfig, DbSchemaOwned)>> {
count_cache_outcome(CacheKind::Schema, self.0.get(endpoint_id))
}
pub async fn get_remote(
&self,
endpoint_id: &EndpointCacheKey,
auth_header: &HeaderValue,
@@ -174,47 +196,42 @@ impl DbSchemaCache {
ctx: &RequestContext,
config: &'static ProxyConfig,
) -> Result<Arc<(ApiConfig, DbSchemaOwned)>, RestError> {
let cache_result = count_cache_outcome(CacheKind::Schema, self.0.get(endpoint_id));
match cache_result {
Some(v) => Ok(v),
None => {
info!("db_schema cache miss for endpoint: {:?}", endpoint_id);
let remote_value = self
.get_remote(auth_header, connection_string, client, ctx, config)
.await;
let (api_config, schema_owned) = match remote_value {
Ok((api_config, schema_owned)) => (api_config, schema_owned),
Err(e @ RestError::SchemaTooLarge) => {
// for the case where the schema is too large, we cache an empty dummy value
// all the other requests will fail without triggering the introspection query
let schema_owned = serde_json::from_str::<DbSchemaOwned>(EMPTY_JSON_SCHEMA)
.map_err(|e| JsonDeserialize { source: e })?;
info!("db_schema cache miss for endpoint: {:?}", endpoint_id);
let remote_value = self
.internal_get_remote(auth_header, connection_string, client, ctx, config)
.await;
let (api_config, schema_owned) = match remote_value {
Ok((api_config, schema_owned)) => (api_config, schema_owned),
Err(e @ RestError::SchemaTooLarge) => {
// for the case where the schema is too large, we cache an empty dummy value
// all the other requests will fail without triggering the introspection query
let schema_owned = serde_json::from_str::<DbSchemaOwned>(EMPTY_JSON_SCHEMA)
.map_err(|e| JsonDeserialize { source: e })?;
let api_config = ApiConfig {
db_schemas: vec![],
db_anon_role: None,
db_max_rows: None,
db_allowed_select_functions: vec![],
role_claim_key: String::new(),
db_extra_search_path: None,
};
let value = Arc::new((api_config, schema_owned));
count_cache_insert(CacheKind::Schema);
self.0.insert(endpoint_id.clone(), value);
return Err(e);
}
Err(e) => {
return Err(e);
}
let api_config = ApiConfig {
db_schemas: vec![],
db_anon_role: None,
db_max_rows: None,
db_allowed_select_functions: vec![],
role_claim_key: String::new(),
db_extra_search_path: None,
server_cors_allowed_origins: None,
};
let value = Arc::new((api_config, schema_owned));
count_cache_insert(CacheKind::Schema);
self.0.insert(endpoint_id.clone(), value.clone());
Ok(value)
self.0.insert(endpoint_id.clone(), value);
return Err(e);
}
}
Err(e) => {
return Err(e);
}
};
let value = Arc::new((api_config, schema_owned));
count_cache_insert(CacheKind::Schema);
self.0.insert(endpoint_id.clone(), value.clone());
Ok(value)
}
pub async fn get_remote(
async fn internal_get_remote(
&self,
auth_header: &HeaderValue,
connection_string: &str,
@@ -531,7 +548,7 @@ pub(crate) async fn handle(
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, ApiError> {
let result = handle_inner(cancel, config, &ctx, request, backend).await;
let mut response = match result {
let response = match result {
Ok(r) => {
ctx.set_success();
@@ -640,9 +657,6 @@ pub(crate) async fn handle(
}
};
response
.headers_mut()
.insert("Access-Control-Allow-Origin", HeaderValue::from_static("*"));
Ok(response)
}
@@ -722,6 +736,37 @@ async fn handle_inner(
}
}
fn apply_common_cors_headers(
response: &mut Builder,
request_headers: &HeaderMap,
allowed_origins: Option<&Vec<String>>,
) {
let request_origin = request_headers
.get(ORIGIN)
.map(|v| v.to_str().unwrap_or(""));
let response_allow_origin = match (request_origin, allowed_origins) {
(Some(or), Some(allowed_origins)) => {
if allowed_origins.iter().any(|o| o == or) {
Some(HeaderValue::from_str(or).unwrap_or(HEADER_VALUE_ALLOW_ALL_ORIGINS))
} else {
None
}
}
(Some(_), None) => Some(HEADER_VALUE_ALLOW_ALL_ORIGINS),
_ => None,
};
if let Some(h) = response.headers_mut() {
h.insert(
ACCESS_CONTROL_EXPOSE_HEADERS,
ACCESS_CONTROL_EXPOSE_HEADERS_VALUE,
);
if let Some(origin) = response_allow_origin {
h.insert(ACCESS_CONTROL_ALLOW_ORIGIN, origin);
}
}
}
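Put differently, apply_common_cors_headers always exposes the response headers, and echoes Access-Control-Allow-Origin only when the request's Origin is acceptable: a configured allow-list must contain the origin exactly, the absence of a configured list falls back to "*", and a request without an Origin header gets no allow-origin header at all. A self-contained sketch of just that decision (hypothetical helper and example origins, not part of the patch):

fn allow_origin(request_origin: Option<&str>, allowed: Option<&[&str]>) -> Option<String> {
    match (request_origin, allowed) {
        // Allow-list configured: echo the origin back only on an exact match.
        (Some(origin), Some(list)) => list.iter().any(|o| *o == origin).then(|| origin.to_string()),
        // No allow-list configured: any origin is acceptable.
        (Some(_), None) => Some("*".to_string()),
        // No Origin header: not a cross-origin request, emit nothing.
        _ => None,
    }
}

fn main() {
    let list: &[&str] = &["https://app.example.com"];
    assert_eq!(
        allow_origin(Some("https://app.example.com"), Some(list)),
        Some("https://app.example.com".to_string())
    );
    assert_eq!(allow_origin(Some("https://other.example"), Some(list)), None);
    assert_eq!(allow_origin(Some("https://anything.example"), None), Some("*".to_string()));
    assert_eq!(allow_origin(None, Some(list)), None);
}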
#[allow(clippy::too_many_arguments)]
async fn handle_rest_inner(
config: &'static ProxyConfig,
@@ -733,12 +778,6 @@ async fn handle_rest_inner(
jwt: String,
backend: Arc<PoolingBackend>,
) -> Result<Response<BoxBody<Bytes, hyper::Error>>, RestError> {
// validate the jwt token
let jwt_parsed = backend
.authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
.await
.map_err(HttpConnError::from)?;
let db_schema_cache =
config
.rest_config
@@ -754,28 +793,83 @@ async fn handle_rest_inner(
message: "Failed to get endpoint cache key".to_string(),
}))?;
let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?;
let (parts, originial_body) = request.into_parts();
// try to get the cached entry for this endpoint;
// it contains the api config and the introspected db schema
let cached_entry = db_schema_cache.get_cached(&endpoint_cache_key);
let allowed_origins = cached_entry
.as_ref()
.and_then(|arc| arc.0.server_cors_allowed_origins.as_ref());
let mut response = Response::builder();
apply_common_cors_headers(&mut response, &parts.headers, allowed_origins);
// handle the OPTIONS request
if parts.method == Method::OPTIONS {
let allowed_headers = parts
.headers
.get(ACCESS_CONTROL_REQUEST_HEADERS)
.and_then(|a| a.to_str().ok())
.filter(|v| !v.is_empty())
.map_or_else(
|| "Authorization".to_string(),
|v| format!("{v}, Authorization"),
);
return response
.status(StatusCode::OK)
.header(
ACCESS_CONTROL_ALLOW_METHODS,
ACCESS_CONTROL_ALLOW_METHODS_VALUE,
)
.header(ACCESS_CONTROL_MAX_AGE, ACCESS_CONTROL_MAX_AGE_VALUE)
.header(
ACCESS_CONTROL_ALLOW_HEADERS,
HeaderValue::from_str(&allowed_headers)
.unwrap_or(ACCESS_CONTROL_ALLOW_HEADERS_VALUE),
)
.header(ALLOW, ACCESS_CONTROL_ALLOW_METHODS_VALUE)
.body(Empty::new().map_err(|x| match x {}).boxed())
.map_err(|e| {
RestError::SubzeroCore(InternalError {
message: e.to_string(),
})
});
}
// validate the jwt token
let jwt_parsed = backend
.authenticate_with_jwt(ctx, &conn_info.user_info, jwt)
.await
.map_err(HttpConnError::from)?;
let auth_header = parts
.headers
.get(AUTHORIZATION)
.ok_or(RestError::SubzeroCore(InternalError {
message: "Authorization header is required".to_string(),
}))?;
let mut client = backend.connect_to_local_proxy(ctx, conn_info).await?;
let entry = db_schema_cache
.get_cached_or_remote(
&endpoint_cache_key,
auth_header,
connection_string,
&mut client,
ctx,
config,
)
.await?;
let entry = match cached_entry {
Some(e) => e,
None => {
// if not cached, get the remote entry (will run the introspection query)
db_schema_cache
.get_remote(
&endpoint_cache_key,
auth_header,
connection_string,
&mut client,
ctx,
config,
)
.await?
}
};
let (api_config, db_schema_owned) = entry.as_ref();
let db_schema = db_schema_owned.borrow_schema();
let db_schemas = &api_config.db_schemas; // list of schemas available for the api
@@ -999,8 +1093,8 @@ async fn handle_rest_inner(
let _metrics = client.metrics(ctx); // FIXME: is everything in the context set correctly?
// send the request to the local proxy
let response = make_raw_local_proxy_request(&mut client, headers, req_body).await?;
let (parts, body) = response.into_parts();
let proxy_response = make_raw_local_proxy_request(&mut client, headers, req_body).await?;
let (response_parts, body) = proxy_response.into_parts();
let max_response = config.http_config.max_response_size_bytes;
let bytes = read_body_with_limit(body, max_response)
@@ -1009,7 +1103,7 @@ async fn handle_rest_inner(
// if the response status is greater than 399, then it is an error
// FIXME: check if there are other error codes or shapes of the response
if parts.status.as_u16() > 399 {
if response_parts.status.as_u16() > 399 {
// turn this postgres error from the json into PostgresError
let postgres_error = serde_json::from_slice(&bytes)
.map_err(|e| RestError::SubzeroCore(JsonDeserialize { source: e }))?;
@@ -1175,7 +1269,7 @@ async fn handle_rest_inner(
.boxed();
// build the response
let mut response = Response::builder()
response = response
.status(StatusCode::from_u16(status).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR))
.header(CONTENT_TYPE, http_content_type);

View File

@@ -510,7 +510,6 @@ impl ApiMethod for ComputeHookTenant {
tracing::info!("Reconfiguring pageservers for endpoint {endpoint_name}");
let shard_count = match shards.len() {
0 => panic!("no shards"),
1 => ShardCount::unsharded(),
n => ShardCount(n.try_into().expect("too many shards")),
};

View File

@@ -644,6 +644,7 @@ async fn handle_tenant_timeline_safekeeper_migrate(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
// TODO(diko): it's not a PS operation; there should be a different permission scope.
check_permissions(&req, Scope::PageServerApi)?;
maybe_rate_limit(&req, tenant_id).await;
@@ -665,6 +666,23 @@ async fn handle_tenant_timeline_safekeeper_migrate(
json_response(StatusCode::OK, ())
}
async fn handle_tenant_timeline_safekeeper_migrate_abort(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
// TODO(diko): it's not a PS operation; there should be a different permission scope.
check_permissions(&req, Scope::PageServerApi)?;
maybe_rate_limit(&req, tenant_id).await;
service
.tenant_timeline_safekeeper_migrate_abort(tenant_id, timeline_id)
.await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_timeline_lsn_lease(
service: Arc<Service>,
req: Request<Body>,
@@ -2611,6 +2629,16 @@ pub fn make_router(
)
},
)
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate_abort",
|r| {
tenant_service_handler(
r,
handle_tenant_timeline_safekeeper_migrate_abort,
RequestName("v1_tenant_timeline_safekeeper_migrate_abort"),
)
},
)
// LSN lease passthrough to all shards
.post(
"/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease",

View File

@@ -1230,10 +1230,7 @@ impl Service {
}
// If it is the same new_sk_set, we can continue the migration (retry).
} else {
let prev_finished = timeline.cplane_notified_generation == timeline.generation
&& timeline.sk_set_notified_generation == timeline.generation;
if !prev_finished {
if !is_migration_finished(&timeline) {
// The previous migration is committed, but the finish step failed.
// Safekeepers/cplane might not know about the last membership configuration.
// Retry the finish step to ensure smooth migration.
@@ -1545,6 +1542,8 @@ impl Service {
timeline_id: TimelineId,
timeline: &TimelinePersistence,
) -> Result<(), ApiError> {
tracing::info!(generation=?timeline.generation, sk_set=?timeline.sk_set, new_sk_set=?timeline.new_sk_set, "retrying finish safekeeper migration");
if timeline.new_sk_set.is_some() {
// Logical error, should never happen.
return Err(ApiError::InternalServerError(anyhow::anyhow!(
@@ -1624,4 +1623,120 @@ impl Service {
Ok(wal_positions[quorum_size - 1])
}
/// Abort ongoing safekeeper migration.
pub(crate) async fn tenant_timeline_safekeeper_migrate_abort(
self: &Arc<Self>,
tenant_id: TenantId,
timeline_id: TimelineId,
) -> Result<(), ApiError> {
// TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks.
let _tenant_lock = trace_shared_lock(
&self.tenant_op_locks,
tenant_id,
TenantOperations::TimelineSafekeeperMigrate,
)
.await;
// Fetch current timeline configuration from the configuration storage.
let timeline = self
.persistence
.get_timeline(tenant_id, timeline_id)
.await?;
let Some(timeline) = timeline else {
return Err(ApiError::NotFound(
anyhow::anyhow!(
"timeline {tenant_id}/{timeline_id} doesn't exist in timelines table"
)
.into(),
));
};
let mut generation = SafekeeperGeneration::new(timeline.generation as u32);
let Some(new_sk_set) = &timeline.new_sk_set else {
// No new_sk_set -> no active migration that we can abort.
tracing::info!("timeline has no active migration");
if !is_migration_finished(&timeline) {
// The last migration is committed, but the finish step failed.
// Safekeepers/cplane might not know about the last membership configuration.
// Retry the finish step to make the timeline state clean.
self.finish_safekeeper_migration_retry(tenant_id, timeline_id, &timeline)
.await?;
}
return Ok(());
};
tracing::info!(sk_set=?timeline.sk_set, ?new_sk_set, ?generation, "aborting timeline migration");
let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?;
let new_safekeepers = self.get_safekeepers(new_sk_set)?;
let cur_sk_member_set =
Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?;
// Increment current generation and remove new_sk_set from the timeline to abort the migration.
generation = generation.next();
let mconf = membership::Configuration {
generation,
members: cur_sk_member_set,
new_members: None,
};
// Exclude safekeepers which were added during the current migration.
let cur_ids: HashSet<NodeId> = cur_safekeepers.iter().map(|sk| sk.get_id()).collect();
let exclude_safekeepers = new_safekeepers
.into_iter()
.filter(|sk| !cur_ids.contains(&sk.get_id()))
.collect::<Vec<_>>();
let exclude_requests = exclude_safekeepers
.iter()
.map(|sk| TimelinePendingOpPersistence {
sk_id: sk.skp.id,
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
generation: generation.into_inner() as i32,
op_kind: SafekeeperTimelineOpKind::Exclude,
})
.collect::<Vec<_>>();
let cur_sk_set = cur_safekeepers
.iter()
.map(|sk| sk.get_id())
.collect::<Vec<_>>();
// Persist new mconf and exclude requests.
self.persistence
.update_timeline_membership(
tenant_id,
timeline_id,
generation,
&cur_sk_set,
None,
&exclude_requests,
)
.await?;
// At this point we have already committed the abort, but still need to notify
// cplane/safekeepers of the new mconf. That's what finish_safekeeper_migration does.
self.finish_safekeeper_migration(
tenant_id,
timeline_id,
&cur_safekeepers,
&mconf,
&exclude_safekeepers,
)
.await?;
Ok(())
}
}
fn is_migration_finished(timeline: &TimelinePersistence) -> bool {
timeline.cplane_notified_generation == timeline.generation
&& timeline.sk_set_notified_generation == timeline.generation
}
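To make the generation bookkeeping concrete: starting a migration commits a joint configuration (generation + 1, with new_sk_set recorded), and the abort handler above commits yet another configuration (generation + 1 again) that keeps the old sk_set and drops new_sk_set; the new test further down exercises exactly this sequence. A toy model of the state transition (assumed shapes only, not storcon's types):

#[derive(Clone, Debug, PartialEq)]
struct Mconf {
    generation: u32,
    sk_set: Vec<u64>,
    new_sk_set: Option<Vec<u64>>,
}

// Committing a joint configuration: bump the generation and record the target set.
fn start_migration(m: &Mconf, target: Vec<u64>) -> Mconf {
    Mconf { generation: m.generation + 1, sk_set: m.sk_set.clone(), new_sk_set: Some(target) }
}

// Aborting: bump the generation again, keep the old sk_set, drop the target set.
fn abort_migration(m: &Mconf) -> Mconf {
    Mconf { generation: m.generation + 1, sk_set: m.sk_set.clone(), new_sk_set: None }
}

fn main() {
    let clean = Mconf { generation: 1, sk_set: vec![1], new_sk_set: None };
    let joint = start_migration(&clean, vec![2]);
    let aborted = abort_migration(&joint);
    assert_eq!(joint.generation, 2);
    assert_eq!(aborted, Mconf { generation: 3, sk_set: vec![1], new_sk_set: None });
}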

View File

@@ -530,10 +530,7 @@ class NeonLocalCli(AbstractNeonCli):
args.extend(["--external-http-port", str(external_http_port)])
if internal_http_port is not None:
args.extend(["--internal-http-port", str(internal_http_port)])
# XXX: By checking for None, we enable the new communicator for all tests
# by default
if grpc or grpc is None:
if grpc:
args.append("--grpc")
if endpoint_id is not None:
args.append(endpoint_id)

View File

@@ -2323,6 +2323,19 @@ class NeonStorageController(MetricsGetter, LogUtils):
response.raise_for_status()
log.info(f"migrate_safekeepers success: {response.json()}")
def abort_safekeeper_migration(
self,
tenant_id: TenantId,
timeline_id: TimelineId,
):
response = self.request(
"POST",
f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate_abort",
headers=self.headers(TokenScope.PAGE_SERVER_API),
)
response.raise_for_status()
log.info(f"abort_safekeeper_migration success: {response.json()}")
def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]:
"""
:return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int}
@@ -4778,7 +4791,7 @@ class Endpoint(PgProtocol, LogUtils):
# set small 'max_replication_write_lag' to enable backpressure
# and make tests more stable.
config_lines += ["max_replication_write_lag=15MB"]
config_lines = ["max_replication_write_lag=15MB"] + config_lines
# Delete file cache if it exists (and we're recreating the endpoint)
if USE_LFC:

View File

@@ -90,8 +90,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
# During shutdown, DownloadError::Cancelled may be logged as an error. Cleaning this
# up is tracked in https://github.com/neondatabase/neon/issues/6096
".*Cancelled, shutting down.*",
# gRPC request failures during shutdown.
".*grpc:pageservice.*request failed with Unavailable: timeline is shutting down.*",
# Open layers are only rolled at Lsn boundaries to avoid name clashes.
# Hence, we can overshoot the soft limit set by checkpoint distance.
# This is especially pronounced in tests that set small checkpoint

View File

@@ -157,7 +157,6 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
[
".*request{method=POST path=/v1/tenant/.*/timeline request_id=.*}: request was dropped before completing.*",
".*page_service_conn_main.*: query handler for 'basebackup .* ERROR: Not found: Timeline",
".*request failed with Unavailable: Timeline .* is not active",
]
)
ps_http = env.pageserver.http_client()
@@ -195,10 +194,7 @@ def test_cannot_create_endpoint_on_non_uploaded_timeline(neon_env_builder: NeonE
env.neon_cli.mappings_map_branch(initial_branch, env.initial_tenant, env.initial_timeline)
with pytest.raises(
RuntimeError,
match=f"Timeline {env.initial_tenant}/{env.initial_timeline} is not active",
):
with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"):
env.endpoints.create_start(
initial_branch, tenant_id=env.initial_tenant, basebackup_request_tries=2
)

View File

@@ -101,37 +101,20 @@ def check_prewarmed_contains(
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
@pytest.mark.parametrize("grpc", [True, False])
@pytest.mark.parametrize("method", METHOD_VALUES, ids=METHOD_IDS)
def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod, grpc: bool):
def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
"""
Test that we can offload an endpoint's LFC cache to endpoint storage.
Test that we can prewarm an endpoint with the LFC cache loaded from endpoint storage.
"""
env = neon_simple_env
n_records = 1000000
# The `neon.file_cache_prewarm_limit` GUC sets the max number of *chunks* to
# load. So the number of *pages* loaded depends on the chunk size. With the
# new communicator, the new LFC implementation doesn't do chunking, so the
# limit is the number of pages, while with the old implementation, the
# default chunk size is 1 MB (128 pages).
#
# Therefore with the old implementation, 1000 chunks equals 128000 pages, if
# all the chunks are fully dense. In practice they are sparse, but should
# amount to > 10000 pages anyway. (We have an assertion below that at least
# 10000 LFC pages are in use after prewarming)
if grpc:
prewarm_limit = 15000
else:
prewarm_limit = 1000
cfg = [
"autovacuum = off",
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
f"neon.file_cache_prewarm_limit={prewarm_limit}",
"neon.file_cache_prewarm_limit=1000",
]
if method == PrewarmMethod.AUTOPREWARM:
@@ -140,10 +123,9 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod, grpc: bool
config_lines=cfg,
autoprewarm=True,
offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS,
grpc=grpc,
)
else:
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg, grpc=grpc)
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
pg_conn = endpoint.connect()
pg_cur = pg_conn.cursor()
@@ -180,7 +162,7 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod, grpc: bool
log.info(f"Used LFC size: {lfc_used_pages}")
pg_cur.execute("select * from neon.get_prewarm_info()")
total, prewarmed, skipped, _ = pg_cur.fetchall()[0]
assert lfc_used_pages >= 10000
assert lfc_used_pages > 10000
assert total > 0
assert prewarmed > 0
assert total == prewarmed + skipped
@@ -204,7 +186,7 @@ def test_lfc_prewarm_cancel(neon_simple_env: NeonEnv):
"shared_buffers=1MB",
"neon.max_file_cache_size=1GB",
"neon.file_cache_size_limit=1GB",
"neon.file_cache_prewarm_limit=2000000",
"neon.file_cache_prewarm_limit=1000",
]
endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)

View File

@@ -35,23 +35,8 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
n_resize = 10
scale = 20
def get_lfc_size() -> tuple[int, int]:
lfc_file_path = endpoint.lfc_path()
lfc_file_size = lfc_file_path.stat().st_size
res = subprocess.run(
["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True
)
lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0]
log.info(f"Size of LFC file {lfc_file_size / (1024 * 1024)}MB, alloc'd KB {lfc_file_blocks}, blocks {int(lfc_file_blocks)/8}")
return (lfc_file_size, lfc_file_blocks)
log.info("Original LFC size")
get_lfc_size()
def run_pgbench(connstr: str):
log.info(f"Start a pgbench workload on pg {connstr}")
get_lfc_size()
pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
pg_bin.run_capture(["pgbench", "-c10", f"-T{n_resize}", "-Mprepared", "-S", connstr])
@@ -65,7 +50,18 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
cur = conn.cursor()
cur.execute("create extension neon")
def get_lfc_size() -> tuple[int, int]:
lfc_file_path = endpoint.lfc_path()
lfc_file_size = lfc_file_path.stat().st_size
res = subprocess.run(
["ls", "-sk", lfc_file_path], check=True, text=True, capture_output=True
)
lfc_file_blocks = re.findall("([0-9A-F]+)", res.stdout)[0]
log.info(f"Size of LFC file {lfc_file_size}, blocks {lfc_file_blocks}")
return (lfc_file_size, lfc_file_blocks)
# For as long as pgbench is running, twiddle the LFC size once a second.
# Note that we launch this immediately, already while the "pgbench -i"
# initialization step is still running. That's quite a different workload
@@ -76,14 +72,11 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
# is really doing something.
size = random.randint(192, 512)
cur.execute(f"alter system set neon.file_cache_size_limit='{size}MB'")
log.info(f"alter system set neon.file_cache_size_limit='{size}MB'")
cur.execute("select pg_reload_conf()")
time.sleep(1)
get_lfc_size()
thread.join()
log.info("Running seqscan.")
# Fill the LFC: the seqscan should fetch the whole table into the cache.
# This is needed so that the LFC file size can be evaluated correctly later
# (a sparse chunk of LFC takes less than 1 MB on disk).
@@ -93,13 +86,6 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
(lfc_file_size, lfc_file_blocks) = get_lfc_size()
assert int(lfc_file_blocks) > 128 * 1024
time.sleep(2)
cur.execute("select count(*) from local_cache")
log.info(f"local_cache size: {cur.fetchall()[0][0]}")
log.info("Beginning actual shrink.")
# At the end, set it to 100 MB, and perform a final check that the disk usage
# of the file is in that ballpark.
#
@@ -138,4 +124,4 @@ def test_lfc_resize(neon_simple_env: NeonEnv, pg_bin: PgBin):
nretries = nretries - 1
time.sleep(1)
# assert local_cache_size == used_pages
assert local_cache_size == used_pages

View File

@@ -17,9 +17,7 @@ def check_tenant(
config_lines = [
f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
]
endpoint = env.endpoints.create_start(
"main", tenant_id=tenant_id, config_lines=config_lines, grpc=True
)
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines)
# we rely upon autocommit after each statement
res_1 = endpoint.safe_psql_many(
queries=[

View File

@@ -145,6 +145,7 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
stop_and_check_lsn(secondary, None)
if method == PromoteMethod.COMPUTE_CTL:
log.info("Restarting primary to check new config")
secondary.stop()
# In production, compute ultimately receives new compute spec from cplane.
secondary.respec(mode="Primary")

View File

@@ -460,3 +460,91 @@ def test_pull_from_most_advanced_sk(neon_env_builder: NeonEnvBuilder):
ep.start(safekeeper_generation=5, safekeepers=new_sk_set2)
assert ep.safe_psql("SELECT * FROM t") == [(0,), (1,)]
def test_abort_safekeeper_migration(neon_env_builder: NeonEnvBuilder):
"""
Test that safekeeper migration can be aborted.
1. Insert failpoints and ensure the abort successfully reverts the timeline state.
2. Check that endpoint is operational after the abort.
"""
neon_env_builder.num_safekeepers = 2
neon_env_builder.storage_controller_config = {
"timelines_onto_safekeepers": True,
"timeline_safekeeper_count": 1,
}
env = neon_env_builder.init_start()
env.pageserver.allowed_errors.extend(PAGESERVER_ALLOWED_ERRORS)
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert len(mconf["sk_set"]) == 1
cur_sk = mconf["sk_set"][0]
cur_gen = 1
ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
ep.safe_psql("CREATE EXTENSION neon_test_utils;")
ep.safe_psql("CREATE TABLE t(a int)")
ep.safe_psql("INSERT INTO t VALUES (1)")
another_sk = [sk.id for sk in env.safekeepers if sk.id != cur_sk][0]
failpoints = [
"sk-migration-after-step-3",
"sk-migration-after-step-4",
"sk-migration-after-step-5",
"sk-migration-after-step-7",
]
for fp in failpoints:
env.storage_controller.configure_failpoints((fp, "return(1)"))
with pytest.raises(StorageControllerApiException, match=f"failpoint {fp}"):
env.storage_controller.migrate_safekeepers(
env.initial_tenant, env.initial_timeline, [another_sk]
)
cur_gen += 1
env.storage_controller.configure_failpoints((fp, "off"))
# We should have a joint mconf after the failure.
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert mconf["generation"] == cur_gen
assert mconf["sk_set"] == [cur_sk]
assert mconf["new_sk_set"] == [another_sk]
env.storage_controller.abort_safekeeper_migration(env.initial_tenant, env.initial_timeline)
cur_gen += 1
# Abort should revert the timeline to the previous sk_set and increment the generation.
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert mconf["generation"] == cur_gen
assert mconf["sk_set"] == [cur_sk]
assert mconf["new_sk_set"] is None
assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith(f"g#{cur_gen}:")
ep.safe_psql(f"INSERT INTO t VALUES ({cur_gen})")
# After step-8 the final mconf is committed and the migration is no longer abortable,
# so this abort should not change anything.
env.storage_controller.configure_failpoints(("sk-migration-after-step-8", "return(1)"))
with pytest.raises(StorageControllerApiException, match="failpoint sk-migration-after-step-8"):
env.storage_controller.migrate_safekeepers(
env.initial_tenant, env.initial_timeline, [another_sk]
)
cur_gen += 2
env.storage_controller.configure_failpoints((fp, "off"))
env.storage_controller.abort_safekeeper_migration(env.initial_tenant, env.initial_timeline)
# The migration is fully committed, no abort should have been performed.
mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
assert mconf["generation"] == cur_gen
assert mconf["sk_set"] == [another_sk]
assert mconf["new_sk_set"] is None
ep.safe_psql(f"INSERT INTO t VALUES ({cur_gen})")
ep.clear_buffers()
assert ep.safe_psql("SELECT * FROM t") == [(i + 1,) for i in range(cur_gen) if i % 2 == 0]

View File

@@ -28,8 +28,8 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
clap = { version = "4", features = ["derive", "env", "string"] }
clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] }
const-oid = { version = "0.9", default-features = false, features = ["db", "std"] }
criterion = { version = "0.5", features = ["html_reports"] }
crossbeam-epoch = { version = "0.9" }
crossbeam-utils = { version = "0.8" }
crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] }
der = { version = "0.7", default-features = false, features = ["derive", "flagset", "oid", "pem", "std"] }
deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] }
@@ -72,6 +72,7 @@ num-integer = { version = "0.1", features = ["i128"] }
num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] }
num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] }
num-traits = { version = "0.2", features = ["i128", "libm"] }
once_cell = { version = "1" }
p256 = { version = "0.13", features = ["jwk"] }
parquet = { version = "53", default-features = false, features = ["zstd"] }
portable-atomic = { version = "1", features = ["require-cas"] }
@@ -104,7 +105,7 @@ tokio-rustls = { version = "0.26", default-features = false, features = ["loggin
tokio-stream = { version = "0.1", features = ["net", "sync"] }
tokio-util = { version = "0.7", features = ["codec", "compat", "io-util", "rt"] }
toml_edit = { version = "0.22", features = ["serde"] }
tonic = { version = "0.13", default-features = false, features = ["codegen", "gzip", "prost", "router", "tls-native-roots", "tls-ring", "transport", "zstd"] }
tonic = { version = "0.13", default-features = false, features = ["codegen", "gzip", "prost", "router", "server", "tls-native-roots", "tls-ring", "zstd"] }
tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "log"] }
tracing = { version = "0.1", features = ["log"] }
tracing-core = { version = "0.1" }
@@ -142,6 +143,7 @@ num-integer = { version = "0.1", features = ["i128"] }
num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] }
num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] }
num-traits = { version = "0.2", features = ["i128", "libm"] }
once_cell = { version = "1" }
parquet = { version = "53", default-features = false, features = ["zstd"] }
prettyplease = { version = "0.2", default-features = false, features = ["verbatim"] }
proc-macro2 = { version = "1" }