Compare commits

..

2 Commits

Author SHA1 Message Date
Suhas Thalanki
e62ff8cc30 Merge branch 'main' into thesuhas/migrate_config.rs_hadron 2025-07-22 11:36:11 -05:00
Suhas Thalanki
582752db4e added config.rs changes from hadron 2025-07-21 16:43:59 -05:00
67 changed files with 1150 additions and 8017 deletions

1
.gitignore vendored
View File

@@ -15,7 +15,6 @@ neon.iml
/.neon
/integration_tests/.neon
compaction-suite-results.*
pgxn/neon/communicator/communicator_bindings.h
docker-compose/docker-compose-parallel.yml
# Coverage

109
Cargo.lock generated
View File

@@ -259,17 +259,6 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8ab6b55fe97976e46f91ddbed8d147d966475dc29b2032757ba47e02376fbc3"
[[package]]
name = "atomic_enum"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99e1aca718ea7b89985790c94aad72d77533063fe00bc497bb79a7c2dae6a661"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.100",
]
[[package]]
name = "autocfg"
version = "1.1.0"
@@ -1305,32 +1294,15 @@ dependencies = [
[[package]]
name = "communicator"
version = "0.0.0"
version = "0.1.0"
dependencies = [
"atomic_enum",
"axum",
"bytes",
"cbindgen",
"clashmap",
"http 1.3.1",
"libc",
"measured",
"metrics",
"neon-shmem",
"nix 0.30.1",
"pageserver_api",
"pageserver_client_grpc",
"pageserver_page_api",
"prometheus",
"prost 0.13.5",
"strum_macros",
"thiserror 1.0.69",
"tokio",
"tokio-pipe",
"tonic",
"tracing",
"tracing-subscriber",
"uring-common",
"utils",
"workspace_hack",
]
@@ -1670,9 +1642,9 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
version = "0.8.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
[[package]]
name = "crossterm"
@@ -2388,12 +2360,6 @@ version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "foldhash"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "form_urlencoded"
version = "1.2.1"
@@ -2761,16 +2727,6 @@ version = "0.15.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
[[package]]
name = "hashbrown"
version = "0.15.4"
source = "git+https://github.com/quantumish/hashbrown.git?rev=6610e6d#6610e6d2b1f288ef7b0709a3efefbc846395dc5e"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash",
]
[[package]]
name = "hashlink"
version = "0.9.1"
@@ -3838,7 +3794,7 @@ dependencies = [
"prometheus",
"rand 0.9.1",
"rand_distr",
"twox-hash 1.6.3",
"twox-hash",
]
[[package]]
@@ -3925,21 +3881,15 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
name = "neon-shmem"
version = "0.1.0"
dependencies = [
"ahash",
"criterion",
"hashbrown 0.15.4",
"libc",
"lock_api",
"nix 0.30.1",
"rand 0.9.1",
"rand_distr",
"rustc-hash 2.1.1",
"seahash",
"tempfile",
"thiserror 1.0.69",
"twox-hash 2.1.1",
"workspace_hack",
"xxhash-rust",
]
[[package]]
@@ -4394,16 +4344,13 @@ version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"axum",
"bytes",
"camino",
"clap",
"futures",
"hdrhistogram",
"http 1.3.1",
"humantime",
"humantime-serde",
"metrics",
"pageserver_api",
"pageserver_client",
"pageserver_client_grpc",
@@ -4493,7 +4440,6 @@ dependencies = [
"pageserver_client",
"pageserver_compaction",
"pageserver_page_api",
"peekable",
"pem",
"pin-project-lite",
"postgres-protocol",
@@ -4507,7 +4453,6 @@ dependencies = [
"pprof",
"pq_proto",
"procfs",
"prost 0.13.5",
"rand 0.9.1",
"range-set-blaze",
"regex",
@@ -4544,7 +4489,7 @@ dependencies = [
"tower 0.5.2",
"tracing",
"tracing-utils",
"twox-hash 1.6.3",
"twox-hash",
"url",
"utils",
"uuid",
@@ -4756,7 +4701,7 @@ dependencies = [
"paste",
"seq-macro",
"thrift",
"twox-hash 1.6.3",
"twox-hash",
"zstd",
"zstd-sys",
]
@@ -4802,15 +4747,6 @@ dependencies = [
"sha2",
]
[[package]]
name = "peekable"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "225f9651e475709164f871dc2f5724956be59cb9edb055372ffeeab01ec2d20b"
dependencies = [
"smallvec",
]
[[package]]
name = "pem"
version = "3.0.3"
@@ -6505,12 +6441,6 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "621e3680f3e07db4c9c2c3fb07c6223ab2fab2e54bd3c04c3ae037990f428c32"
[[package]]
name = "seahash"
version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1c107b6f4780854c8b126e228ea8869f4d7b71260f962fefb57b996b8959ba6b"
[[package]]
name = "sec1"
version = "0.3.0"
@@ -7658,16 +7588,6 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "tokio-pipe"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f213a84bffbd61b8fa0ba8a044b4bbe35d471d0b518867181e82bd5c15542784"
dependencies = [
"libc",
"tokio",
]
[[package]]
name = "tokio-postgres"
version = "0.7.10"
@@ -8204,15 +8124,6 @@ dependencies = [
"static_assertions",
]
[[package]]
name = "twox-hash"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56"
dependencies = [
"rand 0.9.1",
]
[[package]]
name = "typed-json"
version = "0.1.1"
@@ -8932,7 +8843,6 @@ dependencies = [
"clap",
"clap_builder",
"const-oid",
"criterion",
"crypto-bigint 0.5.5",
"der 0.7.8",
"deranged",
@@ -8975,6 +8885,7 @@ dependencies = [
"num-iter",
"num-rational",
"num-traits",
"once_cell",
"p256 0.13.2",
"parquet",
"prettyplease",
@@ -9082,12 +8993,6 @@ version = "0.13.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d25c75bf9ea12c4040a97f829154768bbbce366287e2dc044af160cd79a13fd"
[[package]]
name = "xxhash-rust"
version = "0.8.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3"
[[package]]
name = "yansi"
version = "1.0.1"

View File

@@ -93,7 +93,6 @@ clap = { version = "4.0", features = ["derive", "env"] }
clashmap = { version = "1.0", features = ["raw-api"] }
comfy-table = "7.1"
const_format = "0.2"
crossbeam-utils = "0.8.21"
crc32c = "0.6"
diatomic-waker = { version = "0.2.3" }
either = "1.8"
@@ -146,14 +145,13 @@ num-traits = "0.2.19"
once_cell = "1.13"
opentelemetry = "0.30"
opentelemetry_sdk = "0.30"
opentelemetry-otlp = { version = "0.30", default-features = false, features = ["http-proto", "trace", "http", "reqwest-blocking-client"] }
opentelemetry-otlp = { version = "0.30", default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions = "0.30"
parking_lot = "0.12"
parquet = { version = "53", default-features = false, features = ["zstd"] }
parquet_derive = "53"
pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
pem = "3.0.3"
peekable = "0.3.0"
pin-project-lite = "0.2"
pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointer", "prost-codec"] }
procfs = "0.16"
@@ -192,7 +190,6 @@ smallvec = "1.11"
smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
spki = "0.7.3"
spin = "0.9.8"
strum = "0.26"
strum_macros = "0.26"
"subtle" = "2.5.0"
@@ -204,6 +201,7 @@ thiserror = "1.0"
tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
tokio = { version = "1.43.1", features = ["macros"] }
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
tokio-io-timeout = "1.2.0"
tokio-postgres-rustls = "0.12.0"
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
@@ -242,9 +240,6 @@ x509-cert = { version = "0.2.5" }
env_logger = "0.11"
log = "0.4"
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
uring-common = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }
postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" }

View File

@@ -6,8 +6,7 @@ use compute_api::responses::{
LfcPrewarmState, PromoteState, TlsConfig,
};
use compute_api::spec::{
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverConnectionInfo,
PageserverProtocol, PageserverShardConnectionInfo, PageserverShardInfo, PgIdent,
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
};
use futures::StreamExt;
use futures::future::join_all;
@@ -241,7 +240,7 @@ pub struct ParsedSpec {
pub spec: ComputeSpec,
pub tenant_id: TenantId,
pub timeline_id: TimelineId,
pub pageserver_conninfo: PageserverConnectionInfo,
pub pageserver_connstr: String,
pub safekeeper_connstrings: Vec<String>,
pub storage_auth_token: Option<String>,
/// k8s dns name and port
@@ -288,114 +287,26 @@ impl ParsedSpec {
}
}
/// Extract PageserverConnectionInfo from a comma-separated list of libpq connection strings.
///
/// This is used for backwards-compatilibity, to parse the legacye `pageserver_connstr`
/// field in the compute spec, or the 'neon.pageserver_connstring' GUC. Nowadays, the
/// 'pageserver_connection_info' field should be used instead.
fn extract_pageserver_conninfo_from_connstr(
connstr: &str,
stripe_size: Option<u32>,
) -> Result<PageserverConnectionInfo, anyhow::Error> {
let shard_infos: Vec<_> = connstr
.split(',')
.map(|connstr| PageserverShardInfo {
pageservers: vec![PageserverShardConnectionInfo {
id: None,
libpq_url: Some(connstr.to_string()),
grpc_url: None,
}],
})
.collect();
match shard_infos.len() {
0 => anyhow::bail!("empty connection string"),
1 => {
// We assume that if there's only connection string, it means "unsharded",
// rather than a sharded system with just a single shard. The latter is
// possible in principle, but we never do it.
let shard_count = ShardCount::unsharded();
let only_shard = shard_infos.first().unwrap().clone();
let shards = vec![(ShardIndex::unsharded(), only_shard)];
Ok(PageserverConnectionInfo {
shard_count,
stripe_size: None,
shards: shards.into_iter().collect(),
prefer_protocol: PageserverProtocol::Libpq,
})
}
n => {
if stripe_size.is_none() {
anyhow::bail!("{n} shards but no stripe_size");
}
let shard_count = ShardCount(n.try_into()?);
let shards = shard_infos
.into_iter()
.enumerate()
.map(|(idx, shard_info)| {
(
ShardIndex {
shard_count,
shard_number: ShardNumber(
idx.try_into().expect("shard number fits in u8"),
),
},
shard_info,
)
})
.collect();
Ok(PageserverConnectionInfo {
shard_count,
stripe_size,
shards,
prefer_protocol: PageserverProtocol::Libpq,
})
}
}
}
impl TryFrom<ComputeSpec> for ParsedSpec {
type Error = anyhow::Error;
fn try_from(spec: ComputeSpec) -> Result<Self, anyhow::Error> {
type Error = String;
fn try_from(spec: ComputeSpec) -> Result<Self, String> {
// Extract the options from the spec file that are needed to connect to
// the storage system.
//
// In compute specs generated by old control plane versions, the spec file might
// be missing the `pageserver_connection_info` field. In that case, we need to dig
// the pageserver connection info from the `pageserver_connstr` field instead, or
// if that's missing too, from the GUC in the cluster.settings field.
let mut pageserver_conninfo = spec.pageserver_connection_info.clone();
if pageserver_conninfo.is_none() {
if let Some(pageserver_connstr_field) = &spec.pageserver_connstring {
pageserver_conninfo = Some(extract_pageserver_conninfo_from_connstr(
pageserver_connstr_field,
spec.shard_stripe_size,
)?);
}
}
if pageserver_conninfo.is_none() {
if let Some(guc) = spec.cluster.settings.find("neon.pageserver_connstring") {
let stripe_size = if let Some(guc) = spec.cluster.settings.find("neon.stripe_size")
{
Some(u32::from_str(&guc)?)
} else {
None
};
pageserver_conninfo =
Some(extract_pageserver_conninfo_from_connstr(&guc, stripe_size)?);
}
}
let pageserver_conninfo = pageserver_conninfo.ok_or(anyhow::anyhow!(
"pageserver connection information should be provided"
))?;
// Similarly for safekeeper connection strings
// For backwards-compatibility, the top-level fields in the spec file
// may be empty. In that case, we need to dig them from the GUCs in the
// cluster.settings field.
let pageserver_connstr = spec
.pageserver_connstring
.clone()
.or_else(|| spec.cluster.settings.find("neon.pageserver_connstring"))
.ok_or("pageserver connstr should be provided")?;
let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() {
if matches!(spec.mode, ComputeMode::Primary) {
spec.cluster
.settings
.find("neon.safekeepers")
.ok_or(anyhow::anyhow!("safekeeper connstrings should be provided"))?
.ok_or("safekeeper connstrings should be provided")?
.split(',')
.map(|str| str.to_string())
.collect()
@@ -410,22 +321,22 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id {
tenant_id
} else {
let guc = spec
.cluster
spec.cluster
.settings
.find("neon.tenant_id")
.ok_or(anyhow::anyhow!("tenant id should be provided"))?;
TenantId::from_str(&guc).context("invalid tenant id")?
.ok_or("tenant id should be provided")
.map(|s| TenantId::from_str(&s))?
.or(Err("invalid tenant id"))?
};
let timeline_id: TimelineId = if let Some(timeline_id) = spec.timeline_id {
timeline_id
} else {
let guc = spec
.cluster
spec.cluster
.settings
.find("neon.timeline_id")
.ok_or(anyhow::anyhow!("timeline id should be provided"))?;
TimelineId::from_str(&guc).context(anyhow::anyhow!("invalid timeline id"))?
.ok_or("timeline id should be provided")
.map(|s| TimelineId::from_str(&s))?
.or(Err("invalid timeline id"))?
};
let endpoint_storage_addr: Option<String> = spec
@@ -439,7 +350,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
let res = ParsedSpec {
spec,
pageserver_conninfo,
pageserver_connstr,
safekeeper_connstrings,
storage_auth_token,
tenant_id,
@@ -449,7 +360,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec {
};
// Now check validity of the parsed specification
res.validate().map_err(anyhow::Error::msg)?;
res.validate()?;
Ok(res)
}
}
@@ -1139,10 +1050,12 @@ impl ComputeNode {
fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> {
let spec = compute_state.pspec.as_ref().expect("spec must be set");
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let started = Instant::now();
let (connected, size) = match spec.pageserver_conninfo.prefer_protocol {
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? {
PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?,
PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
};
self.fix_zenith_signal_neon_signal()?;
@@ -1180,32 +1093,23 @@ impl ComputeNode {
/// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
/// the connection was established, and the (compressed) size of the basebackup.
fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
let shard0_index = ShardIndex {
shard_number: ShardNumber(0),
shard_count: spec.pageserver_conninfo.shard_count,
let shard0_connstr = spec
.pageserver_connstr
.split(',')
.next()
.unwrap()
.to_string();
let shard_index = match spec.pageserver_connstr.split(',').count() as u8 {
0 | 1 => ShardIndex::unsharded(),
count => ShardIndex::new(ShardNumber(0), ShardCount(count)),
};
let shard0 = spec
.pageserver_conninfo
.shards
.get(&shard0_index)
.ok_or_else(|| {
anyhow::anyhow!("shard connection info missing for shard {}", shard0_index)
})?;
let pageserver = shard0
.pageservers
.first()
.expect("must have at least one pageserver");
let shard0_url = pageserver
.grpc_url
.clone()
.expect("no grpc_url for shard 0");
let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
let mut client = page_api::Client::connect(
shard0_url,
shard0_connstr,
spec.tenant_id,
spec.timeline_id,
shard0_index,
shard_index,
spec.storage_auth_token.clone(),
None, // NB: base backups use payload compression
)
@@ -1237,26 +1141,8 @@ impl ComputeNode {
/// Fetches a basebackup via libpq. The connstring must use postgresql://. Returns the timestamp
/// when the connection was established, and the (compressed) size of the basebackup.
fn try_get_basebackup_libpq(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
let shard0_index = ShardIndex {
shard_number: ShardNumber(0),
shard_count: spec.pageserver_conninfo.shard_count,
};
let shard0 = spec
.pageserver_conninfo
.shards
.get(&shard0_index)
.ok_or_else(|| {
anyhow::anyhow!("shard connection info missing for shard {}", shard0_index)
})?;
let pageserver = shard0
.pageservers
.first()
.expect("must have at least one pageserver");
let shard0_connstr = pageserver
.libpq_url
.clone()
.expect("no libpq_url for shard 0");
let mut config = postgres::Config::from_str(&shard0_connstr)?;
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
let mut config = postgres::Config::from_str(shard0_connstr)?;
// Use the storage auth token from the config file, if given.
// Note: this overrides any password set in the connection string.
@@ -1342,7 +1228,10 @@ impl ComputeNode {
return result;
}
Err(ref e) if attempts < max_attempts => {
warn!("Failed to get basebackup: {e:?} (attempt {attempts}/{max_attempts})");
warn!(
"Failed to get basebackup: {} (attempt {}/{})",
e, attempts, max_attempts
);
std::thread::sleep(std::time::Duration::from_millis(retry_period_ms as u64));
retry_period_ms *= 1.5;
}
@@ -1550,8 +1439,16 @@ impl ComputeNode {
}
};
self.get_basebackup(compute_state, lsn)
.with_context(|| format!("failed to get basebackup@{lsn}"))?;
info!(
"getting basebackup@{} from pageserver {}",
lsn, &pspec.pageserver_connstr
);
self.get_basebackup(compute_state, lsn).with_context(|| {
format!(
"failed to get basebackup@{} from pageserver {}",
lsn, &pspec.pageserver_connstr
)
})?;
// Update pg_hba.conf received with basebackup.
update_pg_hba(pgdata_path, None)?;
@@ -2495,22 +2392,22 @@ LIMIT 100",
/// The operation will time out after a specified duration.
pub fn wait_timeout_while_pageserver_connstr_unchanged(&self, duration: Duration) {
let state = self.state.lock().unwrap();
let old_pageserver_conninfo = state
let old_pageserver_connstr = state
.pspec
.as_ref()
.expect("spec must be set")
.pageserver_conninfo
.pageserver_connstr
.clone();
let mut unchanged = true;
let _ = self
.state_changed
.wait_timeout_while(state, duration, |s| {
let pageserver_conninfo = &s
let pageserver_connstr = &s
.pspec
.as_ref()
.expect("spec must be set")
.pageserver_conninfo;
unchanged = pageserver_conninfo == &old_pageserver_conninfo;
.pageserver_connstr;
unchanged = pageserver_connstr == &old_pageserver_connstr;
unchanged
})
.unwrap();
@@ -2740,10 +2637,7 @@ mod tests {
match ParsedSpec::try_from(spec.clone()) {
Ok(_p) => panic!("Failed to detect duplicate entry"),
Err(e) => assert!(
e.to_string()
.starts_with("duplicate entry in safekeeper_connstrings:")
),
Err(e) => assert!(e.starts_with("duplicate entry in safekeeper_connstrings:")),
};
}
}

View File

@@ -7,16 +7,17 @@ use std::io::prelude::*;
use std::path::Path;
use compute_api::responses::TlsConfig;
use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
use compute_api::spec::{
ComputeAudit, ComputeMode, ComputeSpec, DatabricksSettings, GenericOption,
};
use crate::compute::ComputeNodeParams;
use crate::pg_helpers::{
GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
DatabricksSettingsExt, GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize,
escape_conf_value,
};
use crate::tls::{self, SERVER_CRT, SERVER_KEY};
use utils::shard::{ShardIndex, ShardNumber};
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
@@ -46,8 +47,10 @@ pub fn write_postgres_conf(
pgdata_path: &Path,
params: &ComputeNodeParams,
spec: &ComputeSpec,
postgres_port: u16,
extension_server_port: u16,
tls_config: &Option<TlsConfig>,
databricks_settings: Option<&DatabricksSettings>,
) -> Result<()> {
let path = pgdata_path.join("postgresql.conf");
// File::create() destroys the file content if it exists.
@@ -58,101 +61,15 @@ pub fn write_postgres_conf(
writeln!(file, "{conf}")?;
}
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
// Add options for connecting to storage
writeln!(file, "# Neon storage settings")?;
writeln!(file)?;
if let Some(conninfo) = &spec.pageserver_connection_info {
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = conninfo.stripe_size {
writeln!(
file,
"# from compute spec's pageserver_conninfo.stripe_size field"
)?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
let mut libpq_urls: Option<Vec<String>> = Some(Vec::new());
let mut grpc_urls: Option<Vec<String>> = Some(Vec::new());
let num_shards = if conninfo.shard_count.0 == 0 {
1 // unsharded, treat it as a single shard
} else {
conninfo.shard_count.0
};
for shard_number in 0..num_shards {
let shard_index = ShardIndex {
shard_number: ShardNumber(shard_number),
shard_count: conninfo.shard_count,
};
let info = conninfo.shards.get(&shard_index).ok_or_else(|| {
anyhow::anyhow!(
"shard {shard_index} missing from pageserver_connection_info shard map"
)
})?;
let first_pageserver = info
.pageservers
.first()
.expect("must have at least one pageserver");
// Add the libpq URL to the array, or if the URL is missing, reset the array
// forgetting any previous entries. All servers must have a libpq URL, or none
// at all.
if let Some(url) = &first_pageserver.libpq_url {
if let Some(ref mut urls) = libpq_urls {
urls.push(url.clone());
}
} else {
libpq_urls = None
}
// Similarly for gRPC URLs
if let Some(url) = &first_pageserver.grpc_url {
if let Some(ref mut urls) = grpc_urls {
urls.push(url.clone());
}
} else {
grpc_urls = None
}
}
if let Some(libpq_urls) = libpq_urls {
writeln!(
file,
"# derived from compute spec's pageserver_conninfo field"
)?;
writeln!(
file,
"neon.pageserver_connstring={}",
escape_conf_value(&libpq_urls.join(","))
)?;
} else {
writeln!(file, "# no neon.pageserver_connstring")?;
}
if let Some(grpc_urls) = grpc_urls {
writeln!(
file,
"# derived from compute spec's pageserver_conninfo field"
)?;
writeln!(
file,
"neon.pageserver_grpc_urls={}",
escape_conf_value(&grpc_urls.join(","))
)?;
} else {
writeln!(file, "# no neon.pageserver_grpc_urls")?;
}
} else {
// Stripe size GUC should be defined prior to connection string
if let Some(stripe_size) = spec.shard_stripe_size {
writeln!(file, "# from compute spec's shard_stripe_size field")?;
writeln!(file, "neon.stripe_size={stripe_size}")?;
}
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "# from compute spec's pageserver_connstring field")?;
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}
if let Some(s) = &spec.pageserver_connstring {
writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
}
if !spec.safekeeper_connstrings.is_empty() {
let mut neon_safekeepers_value = String::new();
tracing::info!(
@@ -373,6 +290,21 @@ pub fn write_postgres_conf(
writeln!(file, "log_destination='stderr,syslog'")?;
}
// Explicitly set the port based on the connstr, overriding any previous port setting.
// Note: It is important that we don't specify a different port again after this.
writeln!(file, "port = {}", postgres_port)?;
// This is databricks specific settings.
// This should be at the end of the file but before `compute_ctl_temp_override.conf` below
// so that it can override any settings above.
// `compute_ctl_temp_override.conf` is intended to override any settings above during specific operations.
// To prevent potential breakage in the future, we keep it above `compute_ctl_temp_override.conf`.
writeln!(file, "# Databricks settings start")?;
if let Some(settings) = databricks_settings {
writeln!(file, "{}", settings.as_pg_settings())?;
}
writeln!(file, "# Databricks settings end")?;
// This is essential to keep this line at the end of the file,
// because it is intended to override any settings above.
writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;

View File

@@ -4,13 +4,14 @@ use std::thread;
use std::time::{Duration, SystemTime};
use anyhow::{Result, bail};
use compute_api::spec::{ComputeMode, PageserverConnectionInfo, PageserverProtocol};
use compute_api::spec::{ComputeMode, PageserverProtocol};
use itertools::Itertools as _;
use pageserver_page_api as page_api;
use postgres::{NoTls, SimpleQueryMessage};
use tracing::{info, warn};
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::shard::TenantShardId;
use utils::shard::{ShardCount, ShardNumber, TenantShardId};
use crate::compute::ComputeNode;
@@ -77,16 +78,17 @@ fn acquire_lsn_lease_with_retry(
loop {
// Note: List of pageservers is dynamic, need to re-read configs before each attempt.
let (conninfo, auth) = {
let (connstrings, auth) = {
let state = compute.state.lock().unwrap();
let spec = state.pspec.as_ref().expect("spec must be set");
(
spec.pageserver_conninfo.clone(),
spec.pageserver_connstr.clone(),
spec.storage_auth_token.clone(),
)
};
let result = try_acquire_lsn_lease(conninfo, auth.as_deref(), tenant_id, timeline_id, lsn);
let result =
try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn);
match result {
Ok(Some(res)) => {
return Ok(res);
@@ -110,44 +112,35 @@ fn acquire_lsn_lease_with_retry(
/// Tries to acquire LSN leases on all Pageserver shards.
fn try_acquire_lsn_lease(
conninfo: PageserverConnectionInfo,
connstrings: &str,
auth: Option<&str>,
tenant_id: TenantId,
timeline_id: TimelineId,
lsn: Lsn,
) -> Result<Option<SystemTime>> {
let connstrings = connstrings.split(',').collect_vec();
let shard_count = connstrings.len();
let mut leases = Vec::new();
for (shard_index, shard) in conninfo.shards.into_iter() {
let tenant_shard_id = TenantShardId {
tenant_id,
shard_number: shard_index.shard_number,
shard_count: shard_index.shard_count,
for (shard_number, &connstring) in connstrings.iter().enumerate() {
let tenant_shard_id = match shard_count {
0 | 1 => TenantShardId::unsharded(tenant_id),
shard_count => TenantShardId {
tenant_id,
shard_number: ShardNumber(shard_number as u8),
shard_count: ShardCount::new(shard_count as u8),
},
};
// XXX: If there are more than pageserver for the one shard, do we need to get a
// leas on all of them? Currently, that's what we assume, but this is hypothetical
// as of this writing, as we never pass the info for more than one pageserver per
// shard.
for pageserver in shard.pageservers {
let lease = match conninfo.prefer_protocol {
PageserverProtocol::Grpc => acquire_lsn_lease_grpc(
&pageserver.grpc_url.unwrap(),
auth,
tenant_shard_id,
timeline_id,
lsn,
)?,
PageserverProtocol::Libpq => acquire_lsn_lease_libpq(
&pageserver.libpq_url.unwrap(),
auth,
tenant_shard_id,
timeline_id,
lsn,
)?,
};
leases.push(lease);
}
let lease = match PageserverProtocol::from_connstring(connstring)? {
PageserverProtocol::Libpq => {
acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)?
}
PageserverProtocol::Grpc => {
acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)?
}
};
leases.push(lease);
}
Ok(leases.into_iter().min().flatten())

View File

@@ -16,14 +16,9 @@ use std::time::Duration;
use anyhow::{Context, Result, anyhow, bail};
use clap::Parser;
use compute_api::requests::ComputeClaimsScope;
use compute_api::spec::{
ComputeMode, PageserverConnectionInfo, PageserverProtocol, PageserverShardInfo,
};
use compute_api::spec::{ComputeMode, PageserverProtocol};
use control_plane::broker::StorageBroker;
use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode};
use control_plane::endpoint::{
pageserver_conf_to_shard_conn_info, tenant_locate_response_to_conn_info,
};
use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage};
use control_plane::local_env;
use control_plane::local_env::{
@@ -49,6 +44,7 @@ use pageserver_api::models::{
};
use pageserver_api::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId};
use postgres_backend::AuthType;
use postgres_connection::parse_host_port;
use safekeeper_api::membership::{SafekeeperGeneration, SafekeeperId};
use safekeeper_api::{
DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
@@ -56,11 +52,11 @@ use safekeeper_api::{
};
use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
use tokio::task::JoinSet;
use url::Host;
use utils::auth::{Claims, Scope};
use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
use utils::lsn::Lsn;
use utils::project_git_version;
use utils::shard::ShardIndex;
// Default id of a safekeeper node, if not specified on the command line.
const DEFAULT_SAFEKEEPER_ID: NodeId = NodeId(1);
@@ -1531,56 +1527,62 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
)?;
}
let prefer_protocol = if endpoint.grpc {
PageserverProtocol::Grpc
} else {
PageserverProtocol::Libpq
};
let mut pageserver_conninfo = if let Some(ps_id) = pageserver_id {
let conf = env.get_pageserver_conf(ps_id).unwrap();
let ps_conninfo = pageserver_conf_to_shard_conn_info(conf)?;
let shard_info = PageserverShardInfo {
pageservers: vec![ps_conninfo],
let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
let conf = env.get_pageserver_conf(pageserver_id).unwrap();
// Use gRPC if requested.
let pageserver = if endpoint.grpc {
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
(PageserverProtocol::Grpc, host, port)
} else {
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
let port = port.unwrap_or(5432);
(PageserverProtocol::Libpq, host, port)
};
// If caller is telling us what pageserver to use, this is not a tenant which is
// fully managed by storage controller, therefore not sharded.
let shards: HashMap<_, _> = vec![(ShardIndex::unsharded(), shard_info)]
.into_iter()
.collect();
PageserverConnectionInfo {
shard_count: ShardCount(0),
stripe_size: None,
shards,
prefer_protocol,
}
(vec![pageserver], DEFAULT_STRIPE_SIZE)
} else {
// Look up the currently attached location of the tenant, and its striping metadata,
// to pass these on to postgres.
let storage_controller = StorageController::from_env(env);
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
assert!(!locate_result.shards.is_empty());
// Initialize LSN leases for static computes.
if let ComputeMode::Static(lsn) = endpoint.mode {
futures::future::try_join_all(locate_result.shards.iter().map(
|shard| async move {
let pageservers = futures::future::try_join_all(
locate_result.shards.into_iter().map(|shard| async move {
if let ComputeMode::Static(lsn) = endpoint.mode {
// Initialize LSN leases for static computes.
let conf = env.get_pageserver_conf(shard.node_id).unwrap();
let pageserver = PageServerNode::from_env(env, conf);
pageserver
.http_client
.timeline_init_lsn_lease(shard.shard_id, endpoint.timeline_id, lsn)
.await
},
))
.await?;
}
.await?;
}
tenant_locate_response_to_conn_info(&locate_result)?
let pageserver = if endpoint.grpc {
(
PageserverProtocol::Grpc,
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))?,
shard.listen_grpc_port.expect("no gRPC port"),
)
} else {
(
PageserverProtocol::Libpq,
Host::parse(&shard.listen_pg_addr)?,
shard.listen_pg_port,
)
};
anyhow::Ok(pageserver)
}),
)
.await?;
let stripe_size = locate_result.shard_params.stripe_size;
(pageservers, stripe_size)
};
pageserver_conninfo.prefer_protocol = prefer_protocol;
assert!(!pageservers.is_empty());
let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
@@ -1610,8 +1612,9 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
endpoint_storage_addr,
safekeepers_generation,
safekeepers,
pageserver_conninfo,
pageservers,
remote_ext_base_url: remote_ext_base_url.clone(),
shard_stripe_size: stripe_size.0 as usize,
create_test_user: args.create_test_user,
start_timeout: args.start_timeout,
autoprewarm: args.autoprewarm,
@@ -1628,45 +1631,51 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
.endpoints
.get(endpoint_id.as_str())
.with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
let prefer_protocol = if endpoint.grpc {
PageserverProtocol::Grpc
} else {
PageserverProtocol::Libpq
};
let mut pageserver_conninfo = if let Some(ps_id) = args.endpoint_pageserver_id {
let pageservers = if let Some(ps_id) = args.endpoint_pageserver_id {
let conf = env.get_pageserver_conf(ps_id)?;
let ps_conninfo = pageserver_conf_to_shard_conn_info(conf)?;
let shard_info = PageserverShardInfo {
pageservers: vec![ps_conninfo],
// Use gRPC if requested.
let pageserver = if endpoint.grpc {
let grpc_addr = conf.listen_grpc_addr.as_ref().expect("bad config");
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
(PageserverProtocol::Grpc, host, port)
} else {
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
let port = port.unwrap_or(5432);
(PageserverProtocol::Libpq, host, port)
};
// If caller is telling us what pageserver to use, this is not a tenant which is
// fully managed by storage controller, therefore not sharded.
let shards: HashMap<_, _> = vec![(ShardIndex::unsharded(), shard_info)]
.into_iter()
.collect();
PageserverConnectionInfo {
shard_count: ShardCount::unsharded(),
stripe_size: None,
shards,
prefer_protocol,
}
vec![pageserver]
} else {
// Look up the currently attached location of the tenant, and its striping metadata,
// to pass these on to postgres.
let storage_controller = StorageController::from_env(env);
let locate_result = storage_controller.tenant_locate(endpoint.tenant_id).await?;
tenant_locate_response_to_conn_info(&locate_result)?
storage_controller
.tenant_locate(endpoint.tenant_id)
.await?
.shards
.into_iter()
.map(|shard| {
// Use gRPC if requested.
if endpoint.grpc {
(
PageserverProtocol::Grpc,
Host::parse(&shard.listen_grpc_addr.expect("no gRPC address"))
.expect("bad hostname"),
shard.listen_grpc_port.expect("no gRPC port"),
)
} else {
(
PageserverProtocol::Libpq,
Host::parse(&shard.listen_pg_addr).expect("bad hostname"),
shard.listen_pg_port,
)
}
})
.collect::<Vec<_>>()
};
pageserver_conninfo.prefer_protocol = prefer_protocol;
// If --safekeepers argument is given, use only the listed
// safekeeper nodes; otherwise all from the env.
let safekeepers = parse_safekeepers(&args.safekeepers)?;
endpoint
.reconfigure(Some(&pageserver_conninfo), safekeepers, None)
.reconfigure(Some(pageservers), None, safekeepers, None)
.await?;
}
EndpointCmd::Stop(args) => {

View File

@@ -37,7 +37,7 @@
//! <other PostgreSQL files>
//! ```
//!
use std::collections::{BTreeMap, HashMap};
use std::collections::BTreeMap;
use std::fmt::Display;
use std::net::{IpAddr, Ipv4Addr, SocketAddr, TcpStream};
use std::path::PathBuf;
@@ -58,12 +58,8 @@ use compute_api::responses::{
};
use compute_api::spec::{
Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol,
PageserverShardInfo, PgIdent, RemoteExtSpec, Role,
PgIdent, RemoteExtSpec, Role,
};
// re-export these, because they're used in the reconfigure() function
pub use compute_api::spec::{PageserverConnectionInfo, PageserverShardConnectionInfo};
use jsonwebtoken::jwk::{
AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations,
OctetKeyPairParameters, OctetKeyPairType, PublicKeyUse,
@@ -78,11 +74,9 @@ use sha2::{Digest, Sha256};
use spki::der::Decode;
use spki::{SubjectPublicKeyInfo, SubjectPublicKeyInfoRef};
use tracing::debug;
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
use utils::shard::{ShardIndex, ShardNumber};
use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT as DEFAULT_PAGESERVER_GRPC_PORT;
use postgres_connection::parse_host_port;
use utils::shard::ShardStripeSize;
use crate::local_env::LocalEnv;
use crate::postgresql_conf::PostgresConf;
@@ -393,8 +387,9 @@ pub struct EndpointStartArgs {
pub endpoint_storage_addr: String,
pub safekeepers_generation: Option<SafekeeperGeneration>,
pub safekeepers: Vec<NodeId>,
pub pageserver_conninfo: PageserverConnectionInfo,
pub pageservers: Vec<(PageserverProtocol, Host, u16)>,
pub remote_ext_base_url: Option<String>,
pub shard_stripe_size: usize,
pub create_test_user: bool,
pub start_timeout: Duration,
pub autoprewarm: bool,
@@ -667,6 +662,14 @@ impl Endpoint {
}
}
fn build_pageserver_connstr(pageservers: &[(PageserverProtocol, Host, u16)]) -> String {
pageservers
.iter()
.map(|(scheme, host, port)| format!("{scheme}://no_user@{host}:{port}"))
.collect::<Vec<_>>()
.join(",")
}
/// Map safekeepers ids to the actual connection strings.
fn build_safekeepers_connstrs(&self, sk_ids: Vec<NodeId>) -> Result<Vec<String>> {
let mut safekeeper_connstrings = Vec::new();
@@ -712,6 +715,9 @@ impl Endpoint {
std::fs::remove_dir_all(self.pgdata())?;
}
let pageserver_connstring = Self::build_pageserver_connstr(&args.pageservers);
assert!(!pageserver_connstring.is_empty());
let safekeeper_connstrings = self.build_safekeepers_connstrs(args.safekeepers)?;
// check for file remote_extensions_spec.json
@@ -726,45 +732,6 @@ impl Endpoint {
remote_extensions = None;
};
// For the sake of backwards-compatibility, also fill in 'pageserver_connstring'
//
// Use a closure so that we can conviniently return None in the middle of the
// loop.
let pageserver_connstring: Option<String> = (|| {
let num_shards = if args.pageserver_conninfo.shard_count.is_unsharded() {
1
} else {
args.pageserver_conninfo.shard_count.0
};
let mut connstrings = Vec::new();
for shard_no in 0..num_shards {
let shard_index = ShardIndex {
shard_count: args.pageserver_conninfo.shard_count,
shard_number: ShardNumber(shard_no),
};
let shard = args
.pageserver_conninfo
.shards
.get(&shard_index)
.ok_or_else(|| {
anyhow!(
"shard {} not found in pageserver_connection_info",
shard_index
)
})?;
let pageserver = shard
.pageservers
.first()
.ok_or(anyhow!("must have at least one pageserver"))?;
if let Some(libpq_url) = &pageserver.libpq_url {
connstrings.push(libpq_url.clone());
} else {
return Ok::<_, anyhow::Error>(None);
}
}
Ok(Some(connstrings.join(",")))
})()?;
// Create config file
let config = {
let mut spec = ComputeSpec {
@@ -809,14 +776,13 @@ impl Endpoint {
branch_id: None,
endpoint_id: Some(self.endpoint_id.clone()),
mode: self.mode,
pageserver_connection_info: Some(args.pageserver_conninfo.clone()),
pageserver_connstring,
pageserver_connstring: Some(pageserver_connstring),
safekeepers_generation: args.safekeepers_generation.map(|g| g.into_inner()),
safekeeper_connstrings,
storage_auth_token: args.auth_token.clone(),
remote_extensions,
pgbouncer_settings: None,
shard_stripe_size: args.pageserver_conninfo.stripe_size, // redundant with pageserver_connection_info.stripe_size
shard_stripe_size: Some(args.shard_stripe_size),
local_proxy_config: None,
reconfigure_concurrency: self.reconfigure_concurrency,
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
@@ -1028,7 +994,8 @@ impl Endpoint {
pub async fn reconfigure(
&self,
pageserver_conninfo: Option<&PageserverConnectionInfo>,
pageservers: Option<Vec<(PageserverProtocol, Host, u16)>>,
stripe_size: Option<ShardStripeSize>,
safekeepers: Option<Vec<NodeId>>,
safekeeper_generation: Option<SafekeeperGeneration>,
) -> Result<()> {
@@ -1043,15 +1010,15 @@ impl Endpoint {
let postgresql_conf = self.read_postgresql_conf()?;
spec.cluster.postgresql_conf = Some(postgresql_conf);
if let Some(pageserver_conninfo) = pageserver_conninfo {
// If pageservers are provided, we need to ensure that they are not empty.
// This is a requirement for the compute_ctl configuration.
anyhow::ensure!(
!pageserver_conninfo.shards.is_empty(),
"no pageservers provided"
);
spec.pageserver_connection_info = Some(pageserver_conninfo.clone());
spec.shard_stripe_size = pageserver_conninfo.stripe_size;
// If pageservers are not specified, don't change them.
if let Some(pageservers) = pageservers {
anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided");
let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
spec.pageserver_connstring = Some(pageserver_connstr);
if stripe_size.is_some() {
spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize);
}
}
// If safekeepers are not specified, don't change them.
@@ -1100,9 +1067,11 @@ impl Endpoint {
pub async fn reconfigure_pageservers(
&self,
pageservers: &PageserverConnectionInfo,
pageservers: Vec<(PageserverProtocol, Host, u16)>,
stripe_size: Option<ShardStripeSize>,
) -> Result<()> {
self.reconfigure(Some(pageservers), None, None).await
self.reconfigure(Some(pageservers), stripe_size, None, None)
.await
}
pub async fn reconfigure_safekeepers(
@@ -1110,7 +1079,7 @@ impl Endpoint {
safekeepers: Vec<NodeId>,
generation: SafekeeperGeneration,
) -> Result<()> {
self.reconfigure(None, Some(safekeepers), Some(generation))
self.reconfigure(None, None, Some(safekeepers), Some(generation))
.await
}
@@ -1166,68 +1135,3 @@ impl Endpoint {
)
}
}
pub fn pageserver_conf_to_shard_conn_info(
conf: &crate::local_env::PageServerConf,
) -> Result<PageserverShardConnectionInfo> {
let libpq_url = {
let (host, port) = parse_host_port(&conf.listen_pg_addr)?;
let port = port.unwrap_or(5432);
Some(format!("postgres://no_user@{host}:{port}"))
};
let grpc_url = if let Some(grpc_addr) = &conf.listen_grpc_addr {
let (host, port) = parse_host_port(grpc_addr)?;
let port = port.unwrap_or(DEFAULT_PAGESERVER_GRPC_PORT);
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
Ok(PageserverShardConnectionInfo {
id: Some(conf.id.to_string()),
libpq_url,
grpc_url,
})
}
pub fn tenant_locate_response_to_conn_info(
response: &pageserver_api::controller_api::TenantLocateResponse,
) -> Result<PageserverConnectionInfo> {
let mut shards = HashMap::new();
for shard in response.shards.iter() {
tracing::info!("parsing {}", shard.listen_pg_addr);
let libpq_url = {
let host = &shard.listen_pg_addr;
let port = shard.listen_pg_port;
Some(format!("postgres://no_user@{host}:{port}"))
};
let grpc_url = if let Some(grpc_addr) = &shard.listen_grpc_addr {
let host = grpc_addr;
let port = shard.listen_grpc_port.expect("no gRPC port");
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
let shard_info = PageserverShardInfo {
pageservers: vec![PageserverShardConnectionInfo {
id: Some(shard.node_id.to_string()),
libpq_url,
grpc_url,
}],
};
shards.insert(shard.shard_id.to_index(), shard_info);
}
let stripe_size = if response.shard_params.count.is_unsharded() {
None
} else {
Some(response.shard_params.stripe_size.0)
};
Ok(PageserverConnectionInfo {
shard_count: response.shard_params.count,
stripe_size,
shards,
prefer_protocol: PageserverProtocol::default(),
})
}

View File

@@ -14,7 +14,6 @@ use serde::{Deserialize, Serialize};
use url::Url;
use utils::id::{TenantId, TimelineId};
use utils::lsn::Lsn;
use utils::shard::{ShardCount, ShardIndex};
use crate::responses::TlsConfig;
@@ -106,17 +105,6 @@ pub struct ComputeSpec {
// updated to fill these fields, we can make these non optional.
pub tenant_id: Option<TenantId>,
pub timeline_id: Option<TimelineId>,
/// Pageserver information can be passed in three different ways:
/// 1. Here in `pageserver_connection_info`
/// 2. In the `pageserver_connstring` field.
/// 3. in `cluster.settings`.
///
/// The goal is to use method 1. everywhere. But for backwards-compatibility with old
/// versions of the control plane, `compute_ctl` will check 2. and 3. if the
/// `pageserver_connection_info` field is missing.
pub pageserver_connection_info: Option<PageserverConnectionInfo>,
pub pageserver_connstring: Option<String>,
// More neon ids that we expose to the compute_ctl
@@ -153,7 +141,7 @@ pub struct ComputeSpec {
// Stripe size for pageserver sharding, in pages
#[serde(default)]
pub shard_stripe_size: Option<u32>,
pub shard_stripe_size: Option<usize>,
/// Local Proxy configuration used for JWT authentication
#[serde(default)]
@@ -226,32 +214,6 @@ pub enum ComputeFeature {
UnknownFeature,
}
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverConnectionInfo {
/// NB: 0 for unsharded tenants, 1 for sharded tenants with 1 shard, following storage
pub shard_count: ShardCount,
/// INVARIANT: null if shard_count is 0, otherwise non-null and immutable
pub stripe_size: Option<u32>,
pub shards: HashMap<ShardIndex, PageserverShardInfo>,
#[serde(default)]
pub prefer_protocol: PageserverProtocol,
}
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverShardInfo {
pub pageservers: Vec<PageserverShardConnectionInfo>,
}
#[derive(Clone, Debug, Deserialize, Serialize, Eq, PartialEq)]
pub struct PageserverShardConnectionInfo {
pub id: Option<String>,
pub libpq_url: Option<String>,
pub grpc_url: Option<String>,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct RemoteExtSpec {
pub public_extensions: Option<Vec<String>>,
@@ -369,12 +331,6 @@ impl ComputeMode {
}
}
impl Display for ComputeMode {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.write_str(self.to_type_str())
}
}
/// Log level for audit logging
#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
pub enum ComputeAudit {
@@ -511,15 +467,13 @@ pub struct JwksSettings {
pub jwt_audience: Option<String>,
}
/// Protocol used to connect to a Pageserver.
#[derive(Clone, Copy, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum PageserverProtocol {
/// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
#[default]
#[serde(rename = "libpq")]
Libpq,
/// A newer, gRPC-based protocol. Uses grpc:// scheme.
#[serde(rename = "grpc")]
Grpc,
}

View File

@@ -6,26 +6,15 @@ license.workspace = true
[dependencies]
thiserror.workspace = true
nix.workspace = true
nix.workspace=true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
libc.workspace = true
lock_api.workspace = true
rustc-hash.workspace = true
[dev-dependencies]
criterion = { workspace = true, features = ["html_reports"] }
rand = "0.9"
rand_distr = "0.5.1"
xxhash-rust = { version = "0.8.15", features = ["xxh3"] }
ahash.workspace = true
twox-hash = { version = "2.1.1" }
seahash = "4.1.0"
hashbrown = { git = "https://github.com/quantumish/hashbrown.git", rev = "6610e6d" }
[target.'cfg(target_os = "macos")'.dependencies]
tempfile = "3.14.0"
[[bench]]
name = "hmap_resize"
harness = false
[dev-dependencies]
rand.workspace = true
rand_distr = "0.5.1"

View File

@@ -1,330 +0,0 @@
use criterion::{BatchSize, BenchmarkId, Criterion, criterion_group, criterion_main};
use neon_shmem::hash::HashMapAccess;
use neon_shmem::hash::HashMapInit;
use neon_shmem::hash::entry::Entry;
use rand::distr::{Distribution, StandardUniform};
use rand::prelude::*;
use std::default::Default;
use std::hash::BuildHasher;
// Taken from bindings to C code
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
#[repr(C)]
pub struct FileCacheKey {
pub _spc_id: u32,
pub _db_id: u32,
pub _rel_number: u32,
pub _fork_num: u32,
pub _block_num: u32,
}
impl Distribution<FileCacheKey> for StandardUniform {
// questionable, but doesn't need to be good randomness
fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> FileCacheKey {
FileCacheKey {
_spc_id: rng.random(),
_db_id: rng.random(),
_rel_number: rng.random(),
_fork_num: rng.random(),
_block_num: rng.random(),
}
}
}
#[derive(Clone, Debug)]
#[repr(C)]
pub struct FileCacheEntry {
pub _offset: u32,
pub _access_count: u32,
pub _prev: *mut FileCacheEntry,
pub _next: *mut FileCacheEntry,
pub _state: [u32; 8],
}
impl FileCacheEntry {
fn dummy() -> Self {
Self {
_offset: 0,
_access_count: 0,
_prev: std::ptr::null_mut(),
_next: std::ptr::null_mut(),
_state: [0; 8],
}
}
}
// Utilities for applying operations.
#[derive(Clone, Debug)]
struct TestOp<K, V>(K, Option<V>);
fn apply_op<K: Clone + std::hash::Hash + Eq, V, S: std::hash::BuildHasher>(
op: TestOp<K, V>,
map: &mut HashMapAccess<K, V, S>,
) {
let entry = map.entry(op.0);
match op.1 {
Some(new) => match entry {
Entry::Occupied(mut e) => Some(e.insert(new)),
Entry::Vacant(e) => {
_ = e.insert(new).unwrap();
None
}
},
None => match entry {
Entry::Occupied(e) => Some(e.remove()),
Entry::Vacant(_) => None,
},
};
}
// Hash utilities
struct SeaRandomState {
k1: u64,
k2: u64,
k3: u64,
k4: u64,
}
impl std::hash::BuildHasher for SeaRandomState {
type Hasher = seahash::SeaHasher;
fn build_hasher(&self) -> Self::Hasher {
seahash::SeaHasher::with_seeds(self.k1, self.k2, self.k3, self.k4)
}
}
impl SeaRandomState {
fn new() -> Self {
let mut rng = rand::rng();
Self {
k1: rng.random(),
k2: rng.random(),
k3: rng.random(),
k4: rng.random(),
}
}
}
fn small_benchs(c: &mut Criterion) {
let mut group = c.benchmark_group("Small maps");
group.sample_size(10);
group.bench_function("small_rehash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_xxhash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(twox_hash::xxhash64::RandomState::default())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_ahash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(ahash::RandomState::default())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("small_rehash_seahash", |b| {
let ideal_filled = 4_000_000;
let size = 5_000_000;
let mut writer = HashMapInit::new_resizeable(size, size * 2)
.with_hasher(SeaRandomState::new())
.attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.finish();
}
fn real_benchs(c: &mut Criterion) {
let mut group = c.benchmark_group("Realistic workloads");
group.sample_size(10);
group.bench_function("real_bulk_insert", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut rng = rand::rng();
b.iter_batched(
|| HashMapInit::new_resizeable(size, size * 2).attach_writer(),
|writer| {
for _ in 0..ideal_filled {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
let entry = writer.entry(key);
match entry {
Entry::Occupied(mut e) => {
std::hint::black_box(e.insert(val));
}
Entry::Vacant(e) => {
let _ = std::hint::black_box(e.insert(val).unwrap());
}
}
}
},
BatchSize::SmallInput,
)
});
group.bench_function("real_rehash", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
});
group.bench_function("real_rehash_hashbrown", |b| {
let size = 125_000_000;
let ideal_filled = 100_000_000;
let mut writer = hashbrown::raw::RawTable::new();
let mut rng = rand::rng();
let hasher = rustc_hash::FxBuildHasher;
unsafe {
writer
.resize(
size,
|(k, _)| hasher.hash_one(k),
hashbrown::raw::Fallibility::Infallible,
)
.unwrap();
}
while writer.len() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
hasher.hash_one(k)
});
}
b.iter(|| unsafe {
writer.table.rehash_in_place(
&|table, index| {
hasher.hash_one(
&table
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
.as_ref()
.0,
)
},
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
Some(|ptr| std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry)))
} else {
None
},
)
});
});
for elems in [2, 4, 8, 16, 32, 64, 96, 112] {
group.bench_with_input(
BenchmarkId::new("real_rehash_varied", elems),
&elems,
|b, &size| {
let ideal_filled = size * 1_000_000;
let size = 125_000_000;
let mut writer = HashMapInit::new_resizeable(size, size).attach_writer();
let mut rng = rand::rng();
while writer.get_num_buckets_in_use() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
apply_op(TestOp(key, Some(val)), &mut writer);
}
b.iter(|| writer.shuffle());
},
);
group.bench_with_input(
BenchmarkId::new("real_rehash_varied_hashbrown", elems),
&elems,
|b, &size| {
let ideal_filled = size * 1_000_000;
let size = 125_000_000;
let mut writer = hashbrown::raw::RawTable::new();
let mut rng = rand::rng();
let hasher = rustc_hash::FxBuildHasher;
unsafe {
writer
.resize(
size,
|(k, _)| hasher.hash_one(k),
hashbrown::raw::Fallibility::Infallible,
)
.unwrap();
}
while writer.len() < ideal_filled as usize {
let key: FileCacheKey = rng.random();
let val = FileCacheEntry::dummy();
writer.insert(hasher.hash_one(&key), (key, val), |(k, _)| {
hasher.hash_one(k)
});
}
b.iter(|| unsafe {
writer.table.rehash_in_place(
&|table, index| {
hasher.hash_one(
&table
.bucket::<(FileCacheKey, FileCacheEntry)>(index)
.as_ref()
.0,
)
},
std::mem::size_of::<(FileCacheKey, FileCacheEntry)>(),
if std::mem::needs_drop::<(FileCacheKey, FileCacheEntry)>() {
Some(|ptr| {
std::ptr::drop_in_place(ptr as *mut (FileCacheKey, FileCacheEntry))
})
} else {
None
},
)
});
},
);
}
group.finish();
}
criterion_group!(benches, small_benchs, real_benchs);
criterion_main!(benches);

View File

@@ -16,7 +16,6 @@
//!
//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock.
use std::fmt::Debug;
use std::hash::{BuildHasher, Hash};
use std::mem::MaybeUninit;
@@ -57,22 +56,6 @@ pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
num_buckets: u32,
}
impl<'a, K, V, S> Debug for HashMapInit<'a, K, V, S>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HashMapInit")
.field("shmem_handle", &self.shmem_handle)
.field("shared_ptr", &self.shared_ptr)
.field("shared_size", &self.shared_size)
// .field("hasher", &self.hasher)
.field("num_buckets", &self.num_buckets)
.finish()
}
}
/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
/// If a child process is launched with fork(), the child process should
/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
@@ -88,20 +71,6 @@ pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
impl<'a, K, V, S> Debug for HashMapAccess<'a, K, V, S>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("HashMapAccess")
.field("shmem_handle", &self.shmem_handle)
.field("shared_ptr", &self.shared_ptr)
// .field("hasher", &self.hasher)
.finish()
}
}
impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
/// Change the 'hasher' used by the hash table.
///
@@ -329,7 +298,7 @@ where
/// Get a reference to the entry containing a key.
///
/// NB: This takes a write lock as there's no way to distinguish whether the intention
/// NB: THis takes a write lock as there's no way to distinguish whether the intention
/// is to use the entry for reading or for writing in advance.
pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
let hash = self.get_hash_value(&key);

View File

@@ -1,6 +1,5 @@
//! Simple hash table with chaining.
use std::fmt::Debug;
use std::hash::Hash;
use std::mem::MaybeUninit;
@@ -18,19 +17,6 @@ pub(crate) struct Bucket<K, V> {
pub(crate) inner: Option<(K, V)>,
}
impl<K, V> Debug for Bucket<K, V>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("Bucket")
.field("next", &self.next)
.field("inner", &self.inner)
.finish()
}
}
/// Core hash table implementation.
pub(crate) struct CoreHashMap<'a, K, V> {
/// Dictionary used to map hashes to bucket indices.
@@ -45,22 +31,6 @@ pub(crate) struct CoreHashMap<'a, K, V> {
pub(crate) buckets_in_use: u32,
}
impl<'a, K, V> Debug for CoreHashMap<'a, K, V>
where
K: Debug,
V: Debug,
{
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("CoreHashMap")
.field("dictionary", &self.dictionary)
.field("buckets", &self.buckets)
.field("free_head", &self.free_head)
.field("alloc_limit", &self.alloc_limit)
.field("buckets_in_use", &self.buckets_in_use)
.finish()
}
}
/// Error for when there are no empty buckets left but one is needed.
#[derive(Debug, PartialEq)]
pub struct FullError;

View File

@@ -61,10 +61,6 @@ impl<K, V> OccupiedEntry<'_, '_, K, V> {
///
/// This may result in multiple bucket accesses if the entry was obtained by index as the
/// previous chain entry needs to be discovered in this case.
///
/// # Panics
/// Panics if the `prev_pos` field is equal to [`PrevPos::Unknown`]. In practice, this means
/// the entry was obtained via calling something like [`super::HashMapAccess::entry_at_bucket`].
pub fn remove(mut self) -> V {
// If this bucket was queried by index, go ahead and follow its chain from the start.
let prev = if let PrevPos::Unknown(hash) = self.prev_pos {

View File

@@ -21,7 +21,6 @@ use nix::unistd::ftruncate as nix_ftruncate;
/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
/// future.
#[derive(Debug)]
pub struct ShmemHandle {
/// memfd file descriptor
fd: OwnedFd,
@@ -36,7 +35,6 @@ pub struct ShmemHandle {
}
/// This is stored at the beginning in the shared memory area.
#[derive(Debug)]
struct SharedStruct {
max_size: usize,

View File

@@ -8,7 +8,7 @@ license.workspace = true
hyper0.workspace = true
opentelemetry = { workspace = true, features = ["trace"] }
opentelemetry_sdk = { workspace = true, features = ["rt-tokio"] }
opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-blocking-client"] }
opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] }
opentelemetry-semantic-conventions.workspace = true
tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
tracing.workspace = true

View File

@@ -59,10 +59,6 @@ impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
pub const MIN: Self = Self(0);
pub fn unsharded() -> Self {
ShardCount(0)
}
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
/// legacy format for TenantShardId that excludes the shard suffix", also known
/// as [`TenantShardId::unsharded`].

View File

@@ -54,7 +54,6 @@ pageserver_api.workspace = true
pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
pageserver_compaction.workspace = true
pageserver_page_api.workspace = true
peekable.workspace = true
pem.workspace = true
pin-project-lite.workspace = true
postgres_backend.workspace = true
@@ -67,7 +66,6 @@ postgres-types.workspace = true
posthog_client_lite.workspace = true
pprof.workspace = true
pq_proto.workspace = true
prost.workspace = true
rand.workspace = true
range-set-blaze = { version = "0.1.16", features = ["alloc"] }
regex.workspace = true

View File

@@ -14,9 +14,9 @@ use utils::logging::warn_slow;
use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
use crate::retry::Retry;
use crate::split::GetPageSplitter;
use compute_api::spec::PageserverProtocol;
use pageserver_page_api as page_api;
use pageserver_page_api::GetPageSplitter;
use utils::id::{TenantId, TimelineId};
use utils::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize};

View File

@@ -1,6 +1,6 @@
mod client;
mod pool;
mod retry;
mod split;
pub use client::{PageserverClient, ShardSpec};
pub use pageserver_api::shard::ShardStripeSize; // used in ShardSpec

View File

@@ -3,18 +3,18 @@ use std::collections::HashMap;
use anyhow::anyhow;
use bytes::Bytes;
use crate::model::*;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::shard::key_to_shard_number;
use pageserver_page_api as page_api;
use utils::shard::{ShardCount, ShardIndex, ShardStripeSize};
/// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
/// TODO: add tests for this.
pub struct GetPageSplitter {
/// Split requests by shard index.
requests: HashMap<ShardIndex, GetPageRequest>,
requests: HashMap<ShardIndex, page_api::GetPageRequest>,
/// The response being assembled. Preallocated with empty pages, to be filled in.
response: GetPageResponse,
response: page_api::GetPageResponse,
/// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used
/// to assemble the response pages in the same order as the original request.
block_shards: Vec<ShardIndex>,
@@ -24,7 +24,7 @@ impl GetPageSplitter {
/// Checks if the given request only touches a single shard, and returns the shard ID. This is
/// the common case, so we check first in order to avoid unnecessary allocations and overhead.
pub fn for_single_shard(
req: &GetPageRequest,
req: &page_api::GetPageRequest,
count: ShardCount,
stripe_size: Option<ShardStripeSize>,
) -> anyhow::Result<Option<ShardIndex>> {
@@ -57,7 +57,7 @@ impl GetPageSplitter {
/// Splits the given request.
pub fn split(
req: GetPageRequest,
req: page_api::GetPageRequest,
count: ShardCount,
stripe_size: Option<ShardStripeSize>,
) -> anyhow::Result<Self> {
@@ -84,7 +84,7 @@ impl GetPageSplitter {
requests
.entry(shard_id)
.or_insert_with(|| GetPageRequest {
.or_insert_with(|| page_api::GetPageRequest {
request_id: req.request_id,
request_class: req.request_class,
rel: req.rel,
@@ -98,16 +98,16 @@ impl GetPageSplitter {
// Construct a response to be populated by shard responses. Preallocate empty page slots
// with the expected block numbers.
let response = GetPageResponse {
let response = page_api::GetPageResponse {
request_id: req.request_id,
status_code: GetPageStatusCode::Ok,
status_code: page_api::GetPageStatusCode::Ok,
reason: None,
rel: req.rel,
pages: req
.block_numbers
.into_iter()
.map(|block_number| {
Page {
page_api::Page {
block_number,
image: Bytes::new(), // empty page slot to be filled in
}
@@ -123,7 +123,9 @@ impl GetPageSplitter {
}
/// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations.
pub fn drain_requests(&mut self) -> impl Iterator<Item = (ShardIndex, GetPageRequest)> {
pub fn drain_requests(
&mut self,
) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
self.requests.drain()
}
@@ -133,10 +135,10 @@ impl GetPageSplitter {
pub fn add_response(
&mut self,
shard_id: ShardIndex,
response: GetPageResponse,
response: page_api::GetPageResponse,
) -> anyhow::Result<()> {
// The caller should already have converted status codes into tonic::Status.
if response.status_code != GetPageStatusCode::Ok {
if response.status_code != page_api::GetPageStatusCode::Ok {
return Err(anyhow!(
"unexpected non-OK response for shard {shard_id}: {} {}",
response.status_code,
@@ -207,7 +209,7 @@ impl GetPageSplitter {
/// Fetches the final, assembled response.
#[allow(clippy::result_large_err)]
pub fn get_response(self) -> anyhow::Result<GetPageResponse> {
pub fn get_response(self) -> anyhow::Result<page_api::GetPageResponse> {
// Check that the response is complete.
for (i, page) in self.response.pages.iter().enumerate() {
if page.image.is_empty() {

View File

@@ -19,9 +19,7 @@ pub mod proto {
}
mod client;
mod model;
mod split;
pub use client::Client;
mod model;
pub use model::*;
pub use split::GetPageSplitter;

View File

@@ -33,8 +33,6 @@ pub enum ProtocolError {
Invalid(&'static str, String),
#[error("required field '{0}' is missing")]
Missing(&'static str),
#[error("invalid combination of not_modified_lsn '{0}' and request_lsn '{1}'")]
InvalidLsns(Lsn, Lsn),
}
impl ProtocolError {
@@ -87,9 +85,9 @@ impl TryFrom<proto::ReadLsn> for ReadLsn {
return Err(ProtocolError::invalid("request_lsn", pb.request_lsn));
}
if pb.not_modified_since_lsn > pb.request_lsn {
return Err(ProtocolError::InvalidLsns(
Lsn(pb.not_modified_since_lsn),
Lsn(pb.request_lsn),
return Err(ProtocolError::invalid(
"not_modified_since_lsn",
pb.not_modified_since_lsn,
));
}
Ok(Self {

View File

@@ -25,9 +25,6 @@ tracing.workspace = true
tokio.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
axum.workspace = true
http.workspace = true
metrics.workspace = true
tonic.workspace = true
url.workspace = true

View File

@@ -34,10 +34,6 @@ use crate::util::{request_stats, tokio_thread_local_stats};
/// GetPage@LatestLSN, uniformly distributed across the compute-accessible keyspace.
#[derive(clap::Parser)]
pub(crate) struct Args {
#[clap(long, default_value = "false")]
grpc: bool,
#[clap(long, default_value = "false")]
grpc_stream: bool,
#[clap(long, default_value = "http://localhost:9898")]
mgmt_api_endpoint: String,
/// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
@@ -82,9 +78,6 @@ pub(crate) struct Args {
#[clap(long)]
set_io_mode: Option<pageserver_api::models::virtual_file::IoMode>,
#[clap(long)]
only_relnode: Option<u32>,
/// Queue depth generated in each client.
#[clap(long, default_value = "1")]
queue_depth: NonZeroUsize,
@@ -99,31 +92,10 @@ pub(crate) struct Args {
#[clap(long, default_value = "1")]
batch_size: NonZeroUsize,
#[clap(long)]
only_relnode: Option<u32>,
targets: Option<Vec<TenantTimelineId>>,
#[clap(long, default_value = "100")]
pool_max_consumers: NonZeroUsize,
#[clap(long, default_value = "5")]
pool_error_threshold: NonZeroUsize,
#[clap(long, default_value = "5000")]
pool_connect_timeout: NonZeroUsize,
#[clap(long, default_value = "1000")]
pool_connect_backoff: NonZeroUsize,
#[clap(long, default_value = "60000")]
pool_max_idle_duration: NonZeroUsize,
#[clap(long, default_value = "0")]
max_delay_ms: usize,
#[clap(long, default_value = "0")]
percent_drops: usize,
#[clap(long, default_value = "0")]
percent_hangs: usize,
}
/// State shared by all clients
@@ -180,6 +152,7 @@ pub(crate) fn main(args: Args) -> anyhow::Result<()> {
main_impl(args, thread_local_stats)
})
}
async fn main_impl(
args: Args,
all_thread_local_stats: AllThreadLocalStats<request_stats::Stats>,
@@ -344,7 +317,6 @@ async fn main_impl(
let rps_period = args
.per_client_rate
.map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
let ss = shared_state.clone();
let cancel = cancel.clone();

View File

@@ -16,8 +16,7 @@ use anyhow::{Context as _, bail};
use bytes::{Buf as _, BufMut as _, BytesMut};
use chrono::Utc;
use futures::future::BoxFuture;
use futures::stream::FuturesUnordered;
use futures::{FutureExt, Stream, StreamExt as _};
use futures::{FutureExt, Stream};
use itertools::Itertools;
use jsonwebtoken::TokenData;
use once_cell::sync::OnceCell;
@@ -36,8 +35,8 @@ use pageserver_api::pagestream_api::{
};
use pageserver_api::reltag::SlruKind;
use pageserver_api::shard::TenantShardId;
use pageserver_page_api as page_api;
use pageserver_page_api::proto;
use pageserver_page_api::{self as page_api, GetPageSplitter};
use postgres_backend::{
AuthType, PostgresBackend, PostgresBackendReader, QueryError, is_expected_io_error,
};
@@ -444,7 +443,6 @@ impl TimelineHandles {
handles: Default::default(),
}
}
async fn get(
&mut self,
tenant_id: TenantId,
@@ -471,13 +469,6 @@ impl TimelineHandles {
fn tenant_id(&self) -> Option<TenantId> {
self.wrapper.tenant_id.get().copied()
}
/// Returns whether a child shard exists locally for the given shard.
fn has_child_shard(&self, tenant_id: TenantId, shard_index: ShardIndex) -> bool {
self.wrapper
.tenant_manager
.has_child_shard(tenant_id, shard_index)
}
}
pub(crate) struct TenantManagerWrapper {
@@ -3387,9 +3378,17 @@ impl GrpcPageServiceHandler {
}
}
/// Acquires a timeline handle for the given request. The shard index must match a local shard.
/// Acquires a timeline handle for the given request.
///
/// NB: this will fail during shard splits, see comment on [`Self::maybe_split_get_page`].
/// TODO: during shard splits, the compute may still be sending requests to the parent shard
/// until the entire split is committed and the compute is notified. Consider installing a
/// temporary shard router from the parent to the children while the split is in progress.
///
/// TODO: consider moving this to a middleware layer; all requests need it. Needs to manage
/// the TimelineHandles lifecycle.
///
/// TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to avoid
/// the unnecessary overhead.
async fn get_request_timeline(
&self,
req: &tonic::Request<impl Any>,
@@ -3398,62 +3397,11 @@ impl GrpcPageServiceHandler {
let shard_index = *extract::<ShardIndex>(req);
let shard_selector = ShardSelector::Known(shard_index);
// TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to
// avoid the unnecessary overhead.
TimelineHandles::new(self.tenant_manager.clone())
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
.await
}
/// Acquires a timeline handle for the given request, which must be for shard zero.
///
/// NB: during an ongoing shard split, the compute will keep talking to the parent shard until
/// the split is committed, but the parent shard may have been removed in the meanwhile. In that
/// case, we reroute the request to the new child shard. See [`Self::maybe_split_get_page`].
///
/// TODO: revamp the split protocol to avoid this child routing.
async fn get_shard_zero_request_timeline(
&self,
req: &tonic::Request<impl Any>,
) -> Result<Handle<TenantManagerTypes>, tonic::Status> {
let ttid = *extract::<TenantTimelineId>(req);
let shard_index = *extract::<ShardIndex>(req);
if shard_index.shard_number.0 != 0 {
return Err(tonic::Status::invalid_argument(format!(
"request must use shard zero (requested shard {shard_index})",
)));
}
// TODO: untangle acquisition from TenantManagerWrapper::resolve() and Cache::get(), to
// avoid the unnecessary overhead.
//
// TODO: this does internal retries, which will delay requests during shard splits (we won't
// look for the child until the parent's retries are exhausted). Don't do that.
let mut handles = TimelineHandles::new(self.tenant_manager.clone());
match handles
.get(
ttid.tenant_id,
ttid.timeline_id,
ShardSelector::Known(shard_index),
)
.await
{
Ok(timeline) => Ok(timeline),
Err(err) => {
// We may be in the middle of a shard split. Try to find a child shard 0.
if let Ok(timeline) = handles
.get(ttid.tenant_id, ttid.timeline_id, ShardSelector::Zero)
.await
&& timeline.get_shard_index().shard_count > shard_index.shard_count
{
return Ok(timeline);
}
Err(err.into())
}
}
}
/// Starts a SmgrOpTimer at received_at, throttles the request, and records execution start.
/// Only errors if the timeline is shutting down.
///
@@ -3485,22 +3433,28 @@ impl GrpcPageServiceHandler {
/// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
/// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
/// split them up in the client or server.
#[instrument(skip_all, fields(
req_id = %req.request_id,
rel = %req.rel,
blkno = %req.block_numbers[0],
blks = %req.block_numbers.len(),
lsn = %req.read_lsn,
))]
#[instrument(skip_all, fields(req_id, rel, blkno, blks, req_lsn, mod_lsn))]
async fn get_page(
ctx: &RequestContext,
timeline: Handle<TenantManagerTypes>,
req: page_api::GetPageRequest,
timeline: &WeakHandle<TenantManagerTypes>,
req: proto::GetPageRequest,
io_concurrency: IoConcurrency,
received_at: Instant,
) -> Result<page_api::GetPageResponse, tonic::Status> {
) -> Result<proto::GetPageResponse, tonic::Status> {
let received_at = Instant::now();
let timeline = timeline.upgrade()?;
let ctx = ctx.with_scope_page_service_pagestream(&timeline);
// Validate the request, decorate the span, and convert it to a Pagestream request.
let req = page_api::GetPageRequest::try_from(req)?;
span_record!(
req_id = %req.request_id,
rel = %req.rel,
blkno = %req.block_numbers[0],
blks = %req.block_numbers.len(),
lsn = %req.read_lsn,
);
let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn(); // hold guard
let effective_lsn = PageServerHandler::effective_request_lsn(
&timeline,
@@ -3575,96 +3529,7 @@ impl GrpcPageServiceHandler {
};
}
Ok(resp)
}
/// Processes a GetPage request when there is a potential shard split in progress. We have to
/// reroute the request any local child shards, and split batch requests that straddle multiple
/// child shards.
///
/// Parent shards are split and removed incrementally, but the compute is only notified once the
/// entire split commits, which can take several minutes. In the meanwhile, the compute will be
/// sending requests to the parent shard.
///
/// TODO: add test infrastructure to provoke this situation frequently and for long periods of
/// time, to properly exercise it.
///
/// TODO: revamp the split protocol to avoid this, e.g.:
/// * Keep the parent shard until the split commits and the compute is notified.
/// * Notify the compute about each subsplit.
/// * Return an error that updates the compute's shard map.
#[instrument(skip_all)]
async fn maybe_split_get_page(
ctx: &RequestContext,
handles: &mut TimelineHandles,
ttid: TenantTimelineId,
parent: ShardIndex,
req: page_api::GetPageRequest,
io_concurrency: IoConcurrency,
received_at: Instant,
) -> Result<page_api::GetPageResponse, tonic::Status> {
// Check the first page to see if we have any child shards at all. Otherwise, the compute is
// just talking to the wrong Pageserver. If the parent has been split, the shard now owning
// the page must have a higher shard count.
let timeline = handles
.get(
ttid.tenant_id,
ttid.timeline_id,
ShardSelector::Page(rel_block_to_key(req.rel, req.block_numbers[0])),
)
.await?;
let shard_id = timeline.get_shard_identity();
if shard_id.count <= parent.shard_count {
return Err(HandleUpgradeError::ShutDown.into()); // emulate original error
}
// Fast path: the request fits in a single shard.
if let Some(shard_index) =
GetPageSplitter::for_single_shard(&req, shard_id.count, Some(shard_id.stripe_size))
.map_err(|err| tonic::Status::internal(err.to_string()))?
{
// We got the shard ID from the first page, so these must be equal.
assert_eq!(shard_index.shard_number, shard_id.number);
assert_eq!(shard_index.shard_count, shard_id.count);
return Self::get_page(ctx, timeline, req, io_concurrency, received_at).await;
}
// The request spans multiple shards; split it and dispatch parallel requests. All pages
// were originally in the parent shard, and during a split all children are local, so we
// expect to find local shards for all pages.
let mut splitter = GetPageSplitter::split(req, shard_id.count, Some(shard_id.stripe_size))
.map_err(|err| tonic::Status::internal(err.to_string()))?;
let mut shard_requests = FuturesUnordered::new();
for (shard_index, shard_req) in splitter.drain_requests() {
let timeline = handles
.get(
ttid.tenant_id,
ttid.timeline_id,
ShardSelector::Known(shard_index),
)
.await?;
let future = Self::get_page(
ctx,
timeline,
shard_req,
io_concurrency.clone(),
received_at,
)
.map(move |result| result.map(|resp| (shard_index, resp)));
shard_requests.push(future);
}
while let Some((shard_index, shard_response)) = shard_requests.next().await.transpose()? {
splitter
.add_response(shard_index, shard_response)
.map_err(|err| tonic::Status::internal(err.to_string()))?;
}
splitter
.get_response()
.map_err(|err| tonic::Status::internal(err.to_string()))
Ok(resp.into())
}
}
@@ -3693,10 +3558,11 @@ impl proto::PageService for GrpcPageServiceHandler {
// to be the sweet spot where throughput is saturated.
const CHUNK_SIZE: usize = 256 * 1024;
let timeline = self.get_shard_zero_request_timeline(&req).await?;
let timeline = self.get_request_timeline(&req).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);
// Validate the request and decorate the span.
Self::ensure_shard_zero(&timeline)?;
if timeline.is_archived() == Some(true) {
return Err(tonic::Status::failed_precondition("timeline is archived"));
}
@@ -3812,10 +3678,11 @@ impl proto::PageService for GrpcPageServiceHandler {
req: tonic::Request<proto::GetDbSizeRequest>,
) -> Result<tonic::Response<proto::GetDbSizeResponse>, tonic::Status> {
let received_at = extract::<ReceivedAt>(&req).0;
let timeline = self.get_shard_zero_request_timeline(&req).await?;
let timeline = self.get_request_timeline(&req).await?;
let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);
// Validate the request, decorate the span, and convert it to a Pagestream request.
Self::ensure_shard_zero(&timeline)?;
let req: page_api::GetDbSizeRequest = req.into_inner().try_into()?;
span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn);
@@ -3844,33 +3711,14 @@ impl proto::PageService for GrpcPageServiceHandler {
req: tonic::Request<tonic::Streaming<proto::GetPageRequest>>,
) -> Result<tonic::Response<Self::GetPagesStream>, tonic::Status> {
// Extract the timeline from the request and check that it exists.
//
// NB: during shard splits, the compute may still send requests to the parent shard. We'll
// reroute requests to the child shards below, but we also detect the common cases here
// where either the shard exists or no shards exist at all. If we have a child shard, we
// can't acquire a weak handle because we don't know which child shard to use yet.
//
// TODO: TimelineHandles.get() does internal retries, which will delay requests during shard
// splits. It shouldn't.
let ttid = *extract::<TenantTimelineId>(&req);
let shard_index = *extract::<ShardIndex>(&req);
let shard_selector = ShardSelector::Known(shard_index);
let mut handles = TimelineHandles::new(self.tenant_manager.clone());
let timeline = match handles
.get(
ttid.tenant_id,
ttid.timeline_id,
ShardSelector::Known(shard_index),
)
.await
{
// The timeline shard exists. Keep a weak handle to reuse for each request.
Ok(timeline) => Some(timeline.downgrade()),
// The shard doesn't exist, but a child shard does. We'll reroute requests later.
Err(_) if handles.has_child_shard(ttid.tenant_id, shard_index) => None,
// Failed to fetch the timeline, and no child shard exists. Error out.
Err(err) => return Err(err.into()),
};
handles
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
.await?;
// Spawn an IoConcurrency sidecar, if enabled.
let gate_guard = self
@@ -3887,9 +3735,11 @@ impl proto::PageService for GrpcPageServiceHandler {
let mut reqs = req.into_inner();
let resps = async_stream::try_stream! {
let timeline = handles
.get(ttid.tenant_id, ttid.timeline_id, shard_selector)
.await?
.downgrade();
loop {
// Wait for the next client request.
//
// NB: Tonic considers the entire stream to be an in-flight request and will wait
// for it to complete before shutting down. React to cancellation between requests.
let req = tokio::select! {
@@ -3902,43 +3752,16 @@ impl proto::PageService for GrpcPageServiceHandler {
Err(err) => Err(err),
},
}?;
let received_at = Instant::now();
let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
// Process the request, using a closure to capture errors.
let process_request = async || {
let req = page_api::GetPageRequest::try_from(req)?;
// Fast path: use the pre-acquired timeline handle.
if let Some(Ok(timeline)) = timeline.as_ref().map(|t| t.upgrade()) {
return Self::get_page(&ctx, timeline, req, io_concurrency.clone(), received_at)
.instrument(span.clone()) // propagate request span
.await
}
// The timeline handle is stale. During shard splits, the compute may still be
// sending requests to the parent shard. Try to re-route requests to the child
// shards, and split any batch requests that straddle multiple child shards.
Self::maybe_split_get_page(
&ctx,
&mut handles,
ttid,
shard_index,
req,
io_concurrency.clone(),
received_at,
)
let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
.instrument(span.clone()) // propagate request span
.await
};
// Return the response. Convert per-request errors to GetPageResponses if
// appropriate, or terminate the stream with a tonic::Status.
yield match process_request().await {
Ok(resp) => resp.into(),
.await;
yield match result {
Ok(resp) => resp,
// Convert per-request errors to GetPageResponses as appropriate, or terminate
// the stream with a tonic::Status. Log the error regardless, since
// ObservabilityLayer can't automatically log stream errors.
Err(status) => {
// Log the error, since ObservabilityLayer won't see stream errors.
// TODO: it would be nice if we could propagate the get_page() fields here.
span.in_scope(|| {
warn!("request failed with {:?}: {}", status.code(), status.message());
@@ -3958,10 +3781,11 @@ impl proto::PageService for GrpcPageServiceHandler {
req: tonic::Request<proto::GetRelSizeRequest>,
) -> Result<tonic::Response<proto::GetRelSizeResponse>, tonic::Status> {
let received_at = extract::<ReceivedAt>(&req).0;
let timeline = self.get_shard_zero_request_timeline(&req).await?;
let timeline = self.get_request_timeline(&req).await?;
let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);
// Validate the request, decorate the span, and convert it to a Pagestream request.
Self::ensure_shard_zero(&timeline)?;
let req: page_api::GetRelSizeRequest = req.into_inner().try_into()?;
let allow_missing = req.allow_missing;
@@ -3994,7 +3818,7 @@ impl proto::PageService for GrpcPageServiceHandler {
req: tonic::Request<proto::GetSlruSegmentRequest>,
) -> Result<tonic::Response<proto::GetSlruSegmentResponse>, tonic::Status> {
let received_at = extract::<ReceivedAt>(&req).0;
let timeline = self.get_shard_zero_request_timeline(&req).await?;
let timeline = self.get_request_timeline(&req).await?;
let ctx = self.ctx.with_scope_page_service_pagestream(&timeline);
// Validate the request, decorate the span, and convert it to a Pagestream request.
@@ -4028,10 +3852,6 @@ impl proto::PageService for GrpcPageServiceHandler {
&self,
req: tonic::Request<proto::LeaseLsnRequest>,
) -> Result<tonic::Response<proto::LeaseLsnResponse>, tonic::Status> {
// TODO: this won't work during shard splits, as the request is directed at a specific shard
// but the parent shard is removed before the split commits and the compute is notified
// (which can take several minutes for large tenants). That's also the case for the libpq
// implementation, so we keep the behavior for now.
let timeline = self.get_request_timeline(&req).await?;
let ctx = self.ctx.with_scope_timeline(&timeline);

View File

@@ -826,18 +826,6 @@ impl TenantManager {
peek_slot.is_some()
}
/// Returns whether a local slot exists for a child shard of the given tenant and shard count.
/// Note that this just checks for a shard with a larger shard count, and it may not be a
/// direct child of the given shard.
pub(crate) fn has_child_shard(&self, tenant_id: TenantId, shard_index: ShardIndex) -> bool {
match &*self.tenants.read().unwrap() {
TenantsMap::Initializing => false,
TenantsMap::Open(slots) | TenantsMap::ShuttingDown(slots) => slots
.range(TenantShardId::tenant_range(tenant_id))
.any(|(tsid, _)| tsid.shard_count > shard_index.shard_count),
}
}
#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
pub(crate) async fn upsert_location(
&self,

View File

@@ -5,12 +5,10 @@ MODULE_big = neon
OBJS = \
$(WIN32RES) \
communicator.o \
communicator_new.o \
communicator_process.o \
extension_server.o \
file_cache.o \
hll.o \
lfc_prewarm.o \
libpagestore.o \
logical_replication_monitor.o \
neon.o \
@@ -65,7 +63,6 @@ WALPROP_OBJS = \
# libcommunicator.a is built by cargo from the Rust sources under communicator/
# subdirectory. `cargo build` also generates communicator_bindings.h.
communicator_new.o: communicator/communicator_bindings.h
communicator_process.o: communicator/communicator_bindings.h
file_cache.o: communicator/communicator_bindings.h

View File

@@ -1,372 +0,0 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "addr2line"
version = "0.24.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1"
dependencies = [
"gimli",
]
[[package]]
name = "adler2"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
[[package]]
name = "backtrace"
version = "0.3.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d82cb332cdfaed17ae235a638438ac4d4839913cc2af585c3c6746e8f8bee1a"
dependencies = [
"addr2line",
"cfg-if",
"libc",
"miniz_oxide",
"object",
"rustc-demangle",
"windows-targets",
]
[[package]]
name = "base64"
version = "0.22.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
[[package]]
name = "bytes"
version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "communicator"
version = "0.1.0"
dependencies = [
"tonic",
]
[[package]]
name = "fnv"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
[[package]]
name = "futures-core"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "gimli"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
[[package]]
name = "http"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
dependencies = [
"bytes",
"fnv",
"itoa",
]
[[package]]
name = "http-body"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
dependencies = [
"bytes",
"http",
]
[[package]]
name = "http-body-util"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
dependencies = [
"bytes",
"futures-core",
"http",
"http-body",
"pin-project-lite",
]
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "libc"
version = "0.2.171"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c19937216e9d3aa9956d9bb8dfc0b0c8beb6058fc4f7a4dc4d850edf86a237d6"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "miniz_oxide"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff70ce3e48ae43fa075863cef62e8b43b71a4f2382229920e0df362592919430"
dependencies = [
"adler2",
]
[[package]]
name = "object"
version = "0.36.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87"
dependencies = [
"memchr",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "percent-encoding"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "pin-project"
version = "1.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a"
dependencies = [
"pin-project-internal",
]
[[package]]
name = "pin-project-internal"
version = "1.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "pin-project-lite"
version = "0.2.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
[[package]]
name = "proc-macro2"
version = "1.0.94"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rustc-demangle"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "syn"
version = "2.0.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tokio"
version = "1.44.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48"
dependencies = [
"backtrace",
"pin-project-lite",
]
[[package]]
name = "tokio-stream"
version = "0.1.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047"
dependencies = [
"futures-core",
"pin-project-lite",
"tokio",
]
[[package]]
name = "tonic"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85839f0b32fd242bb3209262371d07feda6d780d16ee9d2bc88581b89da1549b"
dependencies = [
"base64",
"bytes",
"http",
"http-body",
"http-body-util",
"percent-encoding",
"pin-project",
"tokio-stream",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "tower-layer"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
[[package]]
name = "tower-service"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
[[package]]
name = "tracing"
version = "0.1.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
dependencies = [
"pin-project-lite",
"tracing-attributes",
"tracing-core",
]
[[package]]
name = "tracing-attributes"
version = "0.1.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tracing-core"
version = "0.1.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
dependencies = [
"once_cell",
]
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

View File

@@ -1,8 +1,12 @@
[package]
name = "communicator"
version = "0.1.0"
license.workspace = true
edition.workspace = true
[lib]
crate-type = ["staticlib"]
[features]
# 'testing' feature is currently unused in the communicator, but we accept it for convenience of
# calling build scripts, so that you can pass the same feature to all packages.
@@ -11,36 +15,14 @@ testing = []
# calling build scripts, so that you can pass the same feature to all packages.
rest_broker = []
[lib]
crate-type = ["staticlib"]
[dependencies]
axum.workspace = true
bytes.workspace = true
clashmap.workspace = true
http.workspace = true
libc.workspace = true
nix.workspace = true
atomic_enum = "0.3.0"
measured.workspace = true
prometheus.workspace = true
prost.workspace = true
strum_macros.workspace = true
thiserror.workspace = true
tonic = { workspace = true, default-features = false, features=["codegen", "prost", "transport"] }
tokio = { workspace = true, features = ["macros", "net", "io-util", "rt", "rt-multi-thread"] }
tokio-pipe = { version = "0.2.12" }
tracing.workspace = true
tracing-subscriber.workspace = true
metrics.workspace = true
uring-common = { workspace = true, features = ["bytes"] }
pageserver_client_grpc.workspace = true
pageserver_api.workspace = true
pageserver_page_api.workspace = true
neon-shmem.workspace = true
measured.workspace = true
utils.workspace = true
workspace_hack = { version = "0.1", path = "../../../workspace_hack" }

View File

@@ -3,18 +3,9 @@
This package provides the so-called "compute-pageserver communicator",
or just "communicator" in short. The communicator is a separate
background worker process that runs in the PostgreSQL server. It's
part of the neon extension.
The commuicator handles the communication with the pageservers, and
also provides an HTTP endpoint for metrics over a local Unix Domain
socket (aka. the "communicator control socket"). On the PostgreSQL
side, the glue code in pgxn/neon/ uses the communicator to implement
the PostgreSQL Storage Manager (SMGR) interface.
## Design criteria
- Low latency
- Saturate a 10 Gbit / s network interface without becoming a bottleneck
part of the neon extension. Currently, it only provides an HTTP
endpoint for metrics, but in the future it will evolve to handle all
communications with the pageservers.
## Source code view
@@ -23,122 +14,10 @@ pgxn/neon/communicator_process.c
the glue that interacts with PostgreSQL code and the Rust
code in the communicator process.
pgxn/neon/communicator_new.c
Contains the backend code that interacts with the communicator
process.
pgxn/neon/communicator/src/backend_interface.rs
The entry point for calls from each backend.
pgxn/neon/communicator/src/init.rs
Initialization at server startup
pgxn/neon/communicator/src/worker_process/
Worker process main loop and glue code
At compilation time, pgxn/neon/communicator/ produces a static
library, libcommunicator.a. It is linked to the neon.so extension
library.
The real networking code, which is independent of PostgreSQL, is in
the pageserver/client_grpc crate.
## Process view
The communicator runs in a dedicated background worker process, the
"communicator process". The communicator uses a multi-threaded Tokio
runtime to execute the IO requests. So the communicator process has
multiple threads running. That's unusual for Postgres processes and
care must be taken to make that work.
### Backend <-> worker communication
Each backend has a number of I/O request slots in shared memory. The
slots are statically allocated for each backend, and must not be
accessed by other backends. The worker process reads requests from the
shared memory slots, and writes responses back to the slots.
Here's an example snapshot of the system, when two requests from two
different backends are in progress:
```
Backends Request slots Communicator process
--------- ------------- --------------------
Backend 1 1: Idle
2: Idle
3: Processing tokio task handling request 3
Backend 2 4: Completed
5: Processing tokio task handling request 5
6: Idle
... ...
```
To submit an IO request, the backend first picks one of its Idle
slots, writes the IO request in the slot, and updates it to
'Submitted' state. That transfers the ownership of the slot to the
worker process, until the worker process marks the request as
Completed. The worker process spawns a separate Tokio task for each
request.
To inform the worker process that a request slot has a pending IO
request, there's a pipe shared by the worker process and all backend
processes. The backend writes the index of the request slot to the
pipe after changing the slot's state to Submitted. This wakes up the
worker process.
(Note that the pipe is just used for wakeups, but the worker process
is free to pick up Submitted IO requests even without receiving the
wakeup. As of this writing, it doesn't do that, but it might be useful
in the future to reduce latency even further, for example.)
When the worker process has completed processing the request, it
writes the result back in the request slot. A GetPage request can also
contain a pointer to buffer in the shared buffer cache. In that case,
the worker process writes the resulting page contents directly to the
buffer, and just a result code in the request slot. It then updates
the 'state' field to Completed, which passes the owner ship back to
the originating backend. Finally, it signals the process Latch of the
originating backend, waking it up.
### Differences between PostgreSQL v16, v17 and v18
PostgreSQL v18 introduced the new AIO mechanism. The PostgreSQL AIO
mechanism uses a very similar mechanism as described in the previous
section, for the communication between AIO worker processes and
backends. With our communicator, the AIO worker processes are not
used, but we use the same PgAioHandle request slots as in upstream.
For Neon-specific IO requests like GetDbSize, a neon request slot is
used. But for the actual IO requests, the request slot merely contains
a pointer to the PgAioHandle slot. The worker process updates the
status of that, calls the IO callbacks upon completionetc, just like
the upstream AIO worker processes do.
## Sequence diagram
neon
PostgreSQL extension backend_interface.rs worker_process.rs processor tonic
| . . . .
| smgr_read() . . . .
+-------------> + . . .
. | . . .
. | rcommunicator_ . . .
. | get_page_at_lsn . . .
. +------------------> + . .
| . .
| write request to . . .
| slot . .
| . .
| . .
| submit_request() . .
+-----------------> + .
| | .
| | db_size_request . .
+---------------->.
. TODO
### Compute <-> pageserver protocol
The protocol between Compute and the pageserver is based on gRPC. See `protos/`.

View File

@@ -1,224 +0,0 @@
//! This module implements a request/response "slot" for submitting
//! requests from backends to the communicator process.
//!
//! NB: The "backend" side of this code runs in Postgres backend processes,
//! which means that it is not safe to use the 'tracing' crate for logging, nor
//! to launch threads or use tokio tasks!
use std::cell::UnsafeCell;
use std::sync::atomic::{AtomicI32, Ordering};
use crate::neon_request::{NeonIORequest, NeonIOResult};
use atomic_enum::atomic_enum;
/// One request/response slot. Each backend has its own set of slots that it
/// uses.
///
/// This is the moral equivalent of PgAioHandle for Postgres AIO requests
/// Like PgAioHandle, try to keep this small.
///
/// There is an array of these in shared memory. Therefore, this must be Sized.
///
/// ## Lifecycle of a request
///
/// A slot is always owned by either the backend process or the communicator
/// process, depending on the 'state'. Only the owning process is allowed to
/// read or modify the slot, except for reading the 'state' itself to check who
/// owns it.
///
/// A slot begins in the Idle state, where it is owned by the backend process.
/// To submit a request, the backend process fills the slot with the request
/// data, and changes it to the Submitted state. After changing the state, the
/// slot is owned by the communicator process, and the backend is not allowed
/// to access it until the communicator process marks it as Completed.
///
/// When the communicator process sees that the slot is in Submitted state, it
/// starts to process the request. After processing the request, it stores the
/// result in the slot, and changes the state to Completed. It is now owned by
/// the backend process again, which may now read the result, and reuse the
/// slot for a new request.
///
/// For correctness of the above protocol, we really only need two states:
/// "owned by backend" and "owned by communicator process". But to help with
/// debugging and better assertions, there are a few more states. When the
/// backend starts to fill in the request details in the slot, it first sets the
/// state from Idle to Filling, and when it's done with that, from Filling to
/// Submitted. In the Filling state, the slot is still owned by the
/// backend. Similarly, when the communicator process starts to process a
/// request, it sets it to Processing state first, but the slot is still owned
/// by the communicator process.
///
/// This struct doesn't handle waking up the communicator process when a request
/// has been submitted or when a response is ready. The 'owner_procno' is used
/// for waking up the backend on completion, but that happens elsewhere.
pub struct NeonIORequestSlot {
/// similar to PgAioHandleState
state: AtomicNeonIORequestSlotState,
/// The owning process's ProcNumber. The worker process uses this to set the
/// process's latch on completion.
///
/// (This could be calculated from num_neon_request_slots_per_backend and
/// the index of this slot in the overall 'neon_requst_slots array'. But we
/// prefer the communicator process to not know how the request slots are
/// divided between the backends.)
owner_procno: AtomicI32,
/// SAFETY: This is modified by submit_request(), after it has established
/// ownership of the slot by setting state from Idle to Filling
request: UnsafeCell<NeonIORequest>,
/// Valid when state is Completed
///
/// SAFETY: This is modified by RequestProcessingGuard::complete(). There
/// can be only one RequestProcessingGuard outstanding for a slot at a time,
/// because it is returned by start_processing_request() which checks the
/// state, so RequestProcessingGuard has exclusive access to the slot.
result: UnsafeCell<NeonIOResult>,
}
// The protocol described in the "Lifecycle of a request" section above ensures
// the safe access to the fields
unsafe impl Send for NeonIORequestSlot {}
unsafe impl Sync for NeonIORequestSlot {}
impl Default for NeonIORequestSlot {
fn default() -> NeonIORequestSlot {
NeonIORequestSlot {
owner_procno: AtomicI32::new(-1),
request: UnsafeCell::new(NeonIORequest::Empty),
result: UnsafeCell::new(NeonIOResult::Empty),
state: AtomicNeonIORequestSlotState::new(NeonIORequestSlotState::Idle),
}
}
}
#[atomic_enum]
#[derive(Eq, PartialEq)]
pub enum NeonIORequestSlotState {
Idle,
/// Backend is filling in the request
Filling,
/// Backend has submitted the request to the communicator, but the
/// communicator process has not yet started processing it.
Submitted,
/// Communicator is processing the request
Processing,
/// Communicator has completed the request, and the 'result' field is now
/// valid, but the backend has not read the result yet.
Completed,
}
impl NeonIORequestSlot {
/// Write a request to the slot, and mark it as Submitted.
///
/// Note: This does not wake up the worker process to actually process
/// the request. It's the caller's responsibility to do that.
pub fn submit_request(&self, request: &NeonIORequest, proc_number: i32) {
// Verify that the slot is in Idle state previously, and put it in
// Filling state.
//
// XXX: This step isn't strictly necessary. Assuming the caller didn't
// screw up and try to use a slot that's already in use, we could fill
// the slot and switch it directly from Idle to Submitted state.
if let Err(s) = self.state.compare_exchange(
NeonIORequestSlotState::Idle,
NeonIORequestSlotState::Filling,
Ordering::Relaxed,
Ordering::Relaxed,
) {
panic!("unexpected state in request slot: {s:?}");
}
// Fill in the request details
self.owner_procno.store(proc_number, Ordering::Relaxed);
unsafe { *self.request.get() = *request }
// This synchronizes-with store/swap in [`start_processing_request`].
// Note that this ensures that the previous non-atomic writes visible
// to other threads too.
self.state
.store(NeonIORequestSlotState::Submitted, Ordering::Release);
}
pub fn get_state(&self) -> NeonIORequestSlotState {
self.state.load(Ordering::Relaxed)
}
pub fn try_get_result(&self) -> Option<NeonIOResult> {
// This synchronizes-with the store/swap in [`RequestProcessingGuard::completed`]
let state = self.state.load(Ordering::Acquire);
if state == NeonIORequestSlotState::Completed {
let result = unsafe { *self.result.get() };
self.state
.store(NeonIORequestSlotState::Idle, Ordering::Relaxed);
Some(result)
} else {
None
}
}
/// Read the IO request from the slot indicated in the wakeup
pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
// XXX: using atomic load rather than compare_exchange would be
// sufficient here, as long as the communicator process has _some_ means
// of tracking which requests it's already processing. That could be a
// flag somewhere in communicator's private memory, for example.
//
// This synchronizes-with the store in [`submit_request`].
if let Err(s) = self.state.compare_exchange(
NeonIORequestSlotState::Submitted,
NeonIORequestSlotState::Processing,
Ordering::Acquire,
Ordering::Relaxed,
) {
// FIXME surprising state. This is unexpected at the moment, but if we
// started to process requests more aggressively, without waiting for the
// read from the pipe, then this could happen
panic!("unexpected state in request slot: {s:?}");
}
Some(RequestProcessingGuard(self))
}
}
/// [`NeonIORequestSlot::start_processing_request`] returns this guard object to
/// indicate that the the caller now "owns" the slot, until it calls
/// [`RequestProcessingGuard::completed`].
///
/// TODO: implement Drop on this, to mark the request as Aborted or Errored
/// if [`RequestProcessingGuard::completed`] is not called.
pub struct RequestProcessingGuard<'a>(&'a NeonIORequestSlot);
unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
impl<'a> RequestProcessingGuard<'a> {
pub fn get_request(&self) -> &NeonIORequest {
unsafe { &*self.0.request.get() }
}
pub fn get_owner_procno(&self) -> i32 {
self.0.owner_procno.load(Ordering::Relaxed)
}
pub fn completed(self, result: NeonIOResult) {
// Store the result to the slot.
unsafe {
*self.0.result.get() = result;
};
// Mark the request as completed. After that, we no longer have
// ownership of the slot, and must not modify it.
let old_state = self
.0
.state
.swap(NeonIORequestSlotState::Completed, Ordering::Release);
assert!(old_state == NeonIORequestSlotState::Processing);
}
}

View File

@@ -1,283 +0,0 @@
//! This code runs in each backend process. That means that launching Rust threads, panicking
//! etc. is forbidden!
use std::os::fd::OwnedFd;
use crate::backend_comms::NeonIORequestSlot;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
use crate::neon_request::{CCachedGetPageVResult, COid};
use crate::neon_request::{NeonIORequest, NeonIOResult};
pub struct CommunicatorBackendStruct<'t> {
my_proc_number: i32,
neon_request_slots: &'t [NeonIORequestSlot],
submission_pipe_write_fd: OwnedFd,
pending_cache_read_op: Option<BackendCacheReadOp<'t>>,
integrated_cache: &'t IntegratedCacheReadAccess<'t>,
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_backend_init(
cis: Box<CommunicatorInitStruct>,
my_proc_number: i32,
) -> &'static mut CommunicatorBackendStruct<'static> {
if my_proc_number < 0 {
panic!("cannot attach to communicator shared memory with procnumber {my_proc_number}");
}
let integrated_cache = Box::leak(Box::new(cis.integrated_cache_init_struct.backend_init()));
let bs: &'static mut CommunicatorBackendStruct =
Box::leak(Box::new(CommunicatorBackendStruct {
my_proc_number,
neon_request_slots: cis.neon_request_slots,
submission_pipe_write_fd: cis.submission_pipe_write_fd,
pending_cache_read_op: None,
integrated_cache,
}));
bs
}
/// Start a request. You can poll for its completion and get the result by
/// calling bcomm_poll_dbsize_request_completion(). The communicator will wake
/// us up by setting our process latch, so to wait for the completion, wait on
/// the latch and call bcomm_poll_dbsize_request_completion() every time the
/// latch is set.
///
/// Safety: The C caller must ensure that the references are valid.
/// The requested slot must be free, or this panics.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_io_request(
bs: &'_ mut CommunicatorBackendStruct,
slot_idx: i32,
request: &NeonIORequest,
immediate_result_ptr: &mut NeonIOResult,
) -> i32 {
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
if let NeonIORequest::RelSize(req) = request {
if let Some(nblocks) = bs.integrated_cache.get_rel_size(&req.reltag()) {
*immediate_result_ptr = NeonIOResult::RelSize(nblocks);
return -1;
}
}
// Create neon request and submit it
bs.start_neon_io_request(slot_idx, request);
slot_idx
}
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_start_get_page_v_request(
bs: &mut CommunicatorBackendStruct,
slot_idx: i32,
request: &NeonIORequest,
immediate_result_ptr: &mut CCachedGetPageVResult,
) -> i32 {
let NeonIORequest::GetPageV(get_pagev_request) = request else {
panic!("invalid request passed to bcomm_start_get_page_v_request()");
};
assert!(matches!(request, NeonIORequest::GetPageV(_)));
assert!(bs.pending_cache_read_op.is_none());
// Check if the request can be satisfied from the cache first
let mut all_cached = true;
let mut read_op = bs.integrated_cache.start_read_op();
for i in 0..get_pagev_request.nblocks {
if let Some(cache_block) = read_op.get_page(
&get_pagev_request.reltag(),
get_pagev_request.block_number + i as u32,
) {
immediate_result_ptr.cache_block_numbers[i as usize] = cache_block;
} else {
// not found in cache
all_cached = false;
break;
}
}
if all_cached {
bs.pending_cache_read_op = Some(read_op);
return -1;
}
// Create neon request and submit it
bs.start_neon_io_request(slot_idx, request);
slot_idx
}
/// Check if a request has completed. Returns:
///
/// -1 if the request is still being processed
/// 0 on success
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_poll_request_completion(
bs: &mut CommunicatorBackendStruct,
request_slot_idx: u32,
result_p: &mut NeonIOResult,
) -> i32 {
match bs.neon_request_slots[request_slot_idx as usize].try_get_result() {
None => -1, // still processing
Some(result) => {
*result_p = result;
0
}
}
}
/// Check if a request has completed. Returns:
///
/// 'false' if the slot is Idle. The backend process has ownership.
/// 'true' if the slot is busy, and should be polled for result.
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_get_request_slot_status(
bs: &mut CommunicatorBackendStruct,
request_slot_idx: u32,
) -> bool {
use crate::backend_comms::NeonIORequestSlotState;
match bs.neon_request_slots[request_slot_idx as usize].get_state() {
NeonIORequestSlotState::Idle => false,
NeonIORequestSlotState::Filling => {
// 'false' would be the right result here. However, this
// is a very transient state. The C code should never
// leave a slot in this state, so if it sees that,
// something's gone wrong and it's not clear what to do
// with it.
panic!("unexpected Filling state in request slot {request_slot_idx}");
}
NeonIORequestSlotState::Submitted => true,
NeonIORequestSlotState::Processing => true,
NeonIORequestSlotState::Completed => true,
}
}
// LFC functions
/// Finish a local file cache read
///
//
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_finish_cache_read(bs: &mut CommunicatorBackendStruct) -> bool {
if let Some(op) = bs.pending_cache_read_op.take() {
op.finish()
} else {
panic!("bcomm_finish_cache_read() called with no cached read pending");
}
}
/// Check if the local file cache contians the given block
#[unsafe(no_mangle)]
pub extern "C" fn bcomm_cache_contains(
bs: &mut CommunicatorBackendStruct,
spc_oid: COid,
db_oid: COid,
rel_number: u32,
fork_number: u8,
block_number: u32,
) -> bool {
bs.integrated_cache.cache_contains_page(
&pageserver_page_api::RelTag {
spcnode: spc_oid,
dbnode: db_oid,
relnode: rel_number,
forknum: fork_number,
},
block_number,
)
}
#[repr(C)]
#[derive(Clone, Debug)]
pub struct FileCacheIterator {
next_bucket: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
}
/// Iterate over LFC contents
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_iterate_begin(
_bs: &mut CommunicatorBackendStruct,
iter: *mut FileCacheIterator,
) {
unsafe { (*iter).next_bucket = 0 };
}
#[allow(clippy::missing_safety_doc)]
#[unsafe(no_mangle)]
pub unsafe extern "C" fn bcomm_cache_iterate_next(
bs: &mut CommunicatorBackendStruct,
iter: *mut FileCacheIterator,
) -> bool {
use crate::integrated_cache::GetBucketResult;
loop {
let next_bucket = unsafe { (*iter).next_bucket } as usize;
match bs.integrated_cache.get_bucket(next_bucket) {
GetBucketResult::Occupied(rel, blk) => {
unsafe {
(*iter).spc_oid = rel.spcnode;
(*iter).db_oid = rel.dbnode;
(*iter).rel_number = rel.relnode;
(*iter).fork_number = rel.forknum;
(*iter).block_number = blk;
(*iter).next_bucket += 1;
}
break true;
}
GetBucketResult::Vacant => {
unsafe {
(*iter).next_bucket += 1;
}
continue;
}
GetBucketResult::OutOfBounds => {
break false;
}
}
}
}
impl<'t> CommunicatorBackendStruct<'t> {
/// The slot must be free, or this panics.
pub(crate) fn start_neon_io_request(&mut self, request_slot_idx: i32, request: &NeonIORequest) {
let my_proc_number = self.my_proc_number;
self.neon_request_slots[request_slot_idx as usize].submit_request(request, my_proc_number);
// Tell the communicator about it
self.notify_about_request(request_slot_idx);
}
/// Send a wakeup to the communicator process
fn notify_about_request(self: &CommunicatorBackendStruct<'t>, request_slot_idx: i32) {
// wake up communicator by writing the idx to the submission pipe
//
// This can block, if the pipe is full. That should be very rare,
// because the communicator tries hard to drain the pipe to prevent
// that. Also, there's a natural upper bound on how many wakeups can be
// queued up: there is only a limited number of request slots for each
// backend.
//
// If it does block very briefly, that's not too serious.
let idxbuf = request_slot_idx.to_ne_bytes();
let _res = nix::unistd::write(&self.submission_pipe_write_fd, &idxbuf);
// FIXME: check result, return any errors
}
}

View File

@@ -1,162 +0,0 @@
//! Implement the "low-level" parts of the file cache.
//!
//! This module just deals with reading and writing the file, and keeping track
//! which blocks in the cache file are in use and which are free. The "high
//! level" parts of tracking which block in the cache file corresponds to which
//! relation block is handled in 'integrated_cache' instead.
//!
//! This module is only used to access the file from the communicator
//! process. The backend processes *also* read the file (and sometimes also
//! write it? ), but the backends use direct C library calls for that.
use std::fs::File;
use std::os::unix::fs::FileExt;
use std::path::Path;
use std::sync::Arc;
use std::sync::Mutex;
use crate::BLCKSZ;
use tokio::task::spawn_blocking;
pub type CacheBlock = u64;
pub const INVALID_CACHE_BLOCK: CacheBlock = u64::MAX;
#[derive(Debug)]
pub struct FileCache {
file: Arc<File>,
free_list: Mutex<FreeList>,
// metrics
max_blocks_gauge: metrics::IntGauge,
num_free_blocks_gauge: metrics::IntGauge,
}
// TODO: We keep track of all free blocks in this vec. That doesn't really scale.
// Idea: when free_blocks fills up with more than 1024 entries, write them all to
// one block on disk.
#[derive(Debug)]
struct FreeList {
next_free_block: CacheBlock,
max_blocks: u64,
free_blocks: Vec<CacheBlock>,
}
impl FileCache {
pub fn new(file_cache_path: &Path, mut initial_size: u64) -> Result<FileCache, std::io::Error> {
if initial_size < 100 {
tracing::warn!(
"min size for file cache is 100 blocks, {} requested",
initial_size
);
initial_size = 100;
}
let file = std::fs::OpenOptions::new()
.read(true)
.write(true)
.truncate(true)
.create(true)
.open(file_cache_path)?;
let max_blocks_gauge = metrics::IntGauge::new(
"file_cache_max_blocks",
"Local File Cache size in 8KiB blocks",
)
.unwrap();
let num_free_blocks_gauge = metrics::IntGauge::new(
"file_cache_num_free_blocks",
"Number of free 8KiB blocks in Local File Cache",
)
.unwrap();
tracing::info!("initialized file cache with {} blocks", initial_size);
Ok(FileCache {
file: Arc::new(file),
free_list: Mutex::new(FreeList {
next_free_block: 0,
max_blocks: initial_size,
free_blocks: Vec::new(),
}),
max_blocks_gauge,
num_free_blocks_gauge,
})
}
// File cache management
pub async fn read_block(
&self,
cache_block: CacheBlock,
mut dst: impl uring_common::buf::IoBufMut + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(dst.bytes_total() == BLCKSZ);
let file = self.file.clone();
let dst_ref = unsafe { std::slice::from_raw_parts_mut(dst.stable_mut_ptr(), BLCKSZ) };
spawn_blocking(move || file.read_exact_at(dst_ref, cache_block * BLCKSZ as u64)).await??;
Ok(())
}
pub async fn write_block(
&self,
cache_block: CacheBlock,
src: impl uring_common::buf::IoBuf + Send + Sync,
) -> Result<(), std::io::Error> {
assert!(src.bytes_init() == BLCKSZ);
let file = self.file.clone();
let src_ref = unsafe { std::slice::from_raw_parts(src.stable_ptr(), BLCKSZ) };
spawn_blocking(move || file.write_all_at(src_ref, cache_block * BLCKSZ as u64)).await??;
Ok(())
}
pub fn alloc_block(&self) -> Option<CacheBlock> {
let mut free_list = self.free_list.lock().unwrap();
if let Some(x) = free_list.free_blocks.pop() {
return Some(x);
}
if free_list.next_free_block < free_list.max_blocks {
let result = free_list.next_free_block;
free_list.next_free_block += 1;
return Some(result);
}
None
}
pub fn dealloc_block(&self, cache_block: CacheBlock) {
let mut free_list = self.free_list.lock().unwrap();
free_list.free_blocks.push(cache_block);
}
}
impl metrics::core::Collector for FileCache {
fn desc(&self) -> Vec<&metrics::core::Desc> {
let mut descs = Vec::new();
descs.append(&mut self.max_blocks_gauge.desc());
descs.append(&mut self.num_free_blocks_gauge.desc());
descs
}
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
// Update the gauges with fresh values first
{
let free_list = self.free_list.lock().unwrap();
self.max_blocks_gauge.set(free_list.max_blocks as i64);
let total_free_blocks: i64 = free_list.free_blocks.len() as i64
+ (free_list.max_blocks as i64 - free_list.next_free_block as i64);
self.num_free_blocks_gauge.set(total_free_blocks);
}
let mut values = Vec::new();
values.append(&mut self.max_blocks_gauge.collect());
values.append(&mut self.num_free_blocks_gauge.collect());
values
}
}

View File

@@ -1,109 +0,0 @@
//! Global allocator, for tracking memory usage of the Rust parts
//!
//! Postgres is designed to handle allocation failure (ie. malloc() returning NULL) gracefully. It
//! rolls backs the transaction and gives the user an "ERROR: out of memory" error. Rust code
//! however panics if an allocation fails. We don't want that to ever happen, because an unhandled
//! panic leads to Postgres crash and restart. Our strategy is to pre-allocate a large enough chunk
//! of memory for use by the Rust code, so that the allocations never fail.
//!
//! To pick the size for the pre-allocated chunk, we have a metric to track the high watermark
//! memory usage of all the Rust allocations in total.
//!
//! TODO:
//!
//! - Currently we just export the metrics. Actual allocations are still just passed through to
//! the system allocator.
//! - Take padding etc. overhead into account
use std::alloc::{GlobalAlloc, Layout, System};
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use metrics::IntGauge;
struct MyAllocator {
allocations: AtomicU64,
deallocations: AtomicU64,
allocated: AtomicUsize,
high: AtomicUsize,
}
unsafe impl GlobalAlloc for MyAllocator {
unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
self.allocations.fetch_add(1, Ordering::Relaxed);
let mut allocated = self.allocated.fetch_add(layout.size(), Ordering::Relaxed);
allocated += layout.size();
self.high.fetch_max(allocated, Ordering::Relaxed);
unsafe { System.alloc(layout) }
}
unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
self.deallocations.fetch_add(1, Ordering::Relaxed);
self.allocated.fetch_sub(layout.size(), Ordering::Relaxed);
unsafe { System.dealloc(ptr, layout) }
}
}
#[global_allocator]
static GLOBAL: MyAllocator = MyAllocator {
allocations: AtomicU64::new(0),
deallocations: AtomicU64::new(0),
allocated: AtomicUsize::new(0),
high: AtomicUsize::new(0),
};
pub struct MyAllocatorCollector {
allocations: IntGauge,
deallocations: IntGauge,
allocated: IntGauge,
high: IntGauge,
}
impl MyAllocatorCollector {
pub fn new() -> MyAllocatorCollector {
MyAllocatorCollector {
allocations: IntGauge::new("allocations_total", "Number of allocations in Rust code")
.unwrap(),
deallocations: IntGauge::new(
"deallocations_total",
"Number of deallocations in Rust code",
)
.unwrap(),
allocated: IntGauge::new("allocated_total", "Bytes currently allocated").unwrap(),
high: IntGauge::new("allocated_high", "High watermark of allocated bytes").unwrap(),
}
}
}
impl metrics::core::Collector for MyAllocatorCollector {
fn desc(&self) -> Vec<&metrics::core::Desc> {
let mut descs = Vec::new();
descs.append(&mut self.allocations.desc());
descs.append(&mut self.deallocations.desc());
descs.append(&mut self.allocated.desc());
descs.append(&mut self.high.desc());
descs
}
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
let mut values = Vec::new();
// update the gauges
self.allocations
.set(GLOBAL.allocations.load(Ordering::Relaxed) as i64);
self.deallocations
.set(GLOBAL.allocations.load(Ordering::Relaxed) as i64);
self.allocated
.set(GLOBAL.allocated.load(Ordering::Relaxed) as i64);
self.high.set(GLOBAL.high.load(Ordering::Relaxed) as i64);
values.append(&mut self.allocations.collect());
values.append(&mut self.deallocations.collect());
values.append(&mut self.allocated.collect());
values.append(&mut self.high.collect());
values
}
}

View File

@@ -1,168 +0,0 @@
//! Initialization functions. These are executed in the postmaster process,
//! at different stages of server startup.
//!
//!
//! Communicator initialization steps:
//!
//! 1. At postmaster startup, before shared memory is allocated,
//! rcommunicator_shmem_size() is called to get the amount of
//! shared memory that this module needs.
//!
//! 2. Later, after the shared memory has been allocated,
//! rcommunicator_shmem_init() is called to initialize the shmem
//! area.
//!
//! Per process initialization:
//!
//! When a backend process starts up, it calls rcommunicator_backend_init().
//! In the communicator worker process, other functions are called, see
//! `worker_process` module.
use std::ffi::c_int;
use std::mem;
use std::mem::MaybeUninit;
use std::os::fd::OwnedFd;
use crate::backend_comms::NeonIORequestSlot;
use crate::integrated_cache::IntegratedCacheInitStruct;
/// This struct is created in the postmaster process, and inherited to
/// the communicator process and all backend processes through fork()
#[repr(C)]
pub struct CommunicatorInitStruct {
pub submission_pipe_read_fd: OwnedFd,
pub submission_pipe_write_fd: OwnedFd,
// Shared memory data structures
pub num_neon_request_slots: u32,
pub neon_request_slots: &'static [NeonIORequestSlot],
pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>,
}
impl std::fmt::Debug for CommunicatorInitStruct {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
fmt.debug_struct("CommunicatorInitStruct")
.field("submission_pipe_read_fd", &self.submission_pipe_read_fd)
.field("submission_pipe_write_fd", &self.submission_pipe_write_fd)
.field("num_neon_request_slots", &self.num_neon_request_slots)
.field("neon_request_slots length", &self.neon_request_slots.len())
.finish()
}
}
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_size(num_neon_request_slots: u32) -> u64 {
let mut size = 0;
size += mem::size_of::<NeonIORequestSlot>() * num_neon_request_slots as usize;
// For integrated_cache's Allocator. TODO: make this adjustable
size += IntegratedCacheInitStruct::shmem_size();
size as u64
}
/// Initialize the shared memory segment. Returns a backend-private
/// struct, which will be inherited by backend processes through fork
#[unsafe(no_mangle)]
pub extern "C" fn rcommunicator_shmem_init(
submission_pipe_read_fd: c_int,
submission_pipe_write_fd: c_int,
num_neon_request_slots: u32,
shmem_area_ptr: *mut MaybeUninit<u8>,
shmem_area_len: u64,
initial_file_cache_size: u64,
max_file_cache_size: u64,
) -> &'static mut CommunicatorInitStruct {
let shmem_area: &'static mut [MaybeUninit<u8>] =
unsafe { std::slice::from_raw_parts_mut(shmem_area_ptr, shmem_area_len as usize) };
let (neon_request_slots, remaining_area) =
alloc_array_from_slice::<NeonIORequestSlot>(shmem_area, num_neon_request_slots as usize);
for slot in neon_request_slots.iter_mut() {
slot.write(NeonIORequestSlot::default());
}
// 'neon_request_slots' is initialized now. (MaybeUninit::slice_assume_init_mut() is nightly-only
// as of this writing.)
let neon_request_slots = unsafe {
std::mem::transmute::<&mut [MaybeUninit<NeonIORequestSlot>], &mut [NeonIORequestSlot]>(
neon_request_slots,
)
};
// Give the rest of the area to the integrated cache
let integrated_cache_init_struct = IntegratedCacheInitStruct::shmem_init(
remaining_area,
initial_file_cache_size,
max_file_cache_size,
);
let (submission_pipe_read_fd, submission_pipe_write_fd) = unsafe {
use std::os::fd::FromRawFd;
(
OwnedFd::from_raw_fd(submission_pipe_read_fd),
OwnedFd::from_raw_fd(submission_pipe_write_fd),
)
};
let cis: &'static mut CommunicatorInitStruct = Box::leak(Box::new(CommunicatorInitStruct {
submission_pipe_read_fd,
submission_pipe_write_fd,
num_neon_request_slots,
neon_request_slots,
integrated_cache_init_struct,
}));
cis
}
// fixme: currently unused
#[allow(dead_code)]
pub fn alloc_from_slice<T>(
area: &mut [MaybeUninit<u8>],
) -> (&mut MaybeUninit<T>, &mut [MaybeUninit<u8>]) {
let layout = std::alloc::Layout::new::<T>();
let area_start = area.as_mut_ptr();
// pad to satisfy alignment requirements
let padding = area_start.align_offset(layout.align());
if padding + layout.size() > area.len() {
panic!("out of memory");
}
let area = &mut area[padding..];
let (result_area, remain) = area.split_at_mut(layout.size());
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
let result = unsafe { result_ptr.as_mut().unwrap() };
(result, remain)
}
pub fn alloc_array_from_slice<T>(
area: &mut [MaybeUninit<u8>],
len: usize,
) -> (&mut [MaybeUninit<T>], &mut [MaybeUninit<u8>]) {
let layout = std::alloc::Layout::new::<T>();
let area_start = area.as_mut_ptr();
// pad to satisfy alignment requirements
let padding = area_start.align_offset(layout.align());
if padding + layout.size() * len > area.len() {
panic!("out of memory");
}
let area = &mut area[padding..];
let (result_area, remain) = area.split_at_mut(layout.size() * len);
let result_ptr: *mut MaybeUninit<T> = result_area.as_mut_ptr().cast();
let result = unsafe { std::slice::from_raw_parts_mut(result_ptr.as_mut().unwrap(), len) };
(result, remain)
}

View File

@@ -1,824 +0,0 @@
//! Integrated communicator cache
//!
//! It tracks:
//! - Relation sizes and existence
//! - Last-written LSN
//! - Block cache (also known as LFC)
//!
//! TODO: limit the size
//! TODO: concurrency
//!
//! Note: This deals with "relations" which is really just one "relation fork" in Postgres
//! terms. RelFileLocator + ForkNumber is the key.
//
// TODO: Thoughts on eviction:
//
// There are two things we need to track, and evict if we run out of space:
// - blocks in the file cache's file. If the file grows too large, need to evict something.
// Also if the cache is resized
//
// - entries in the cache map. If we run out of memory in the shmem area, need to evict
// something
//
use std::mem::MaybeUninit;
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
use utils::lsn::{AtomicLsn, Lsn};
use crate::file_cache::INVALID_CACHE_BLOCK;
use crate::file_cache::{CacheBlock, FileCache};
use pageserver_page_api::RelTag;
use metrics::{IntCounter, IntGauge};
use neon_shmem::hash::{HashMapInit, entry::Entry};
use neon_shmem::shmem::ShmemHandle;
// in # of entries
const RELSIZE_CACHE_SIZE: u32 = 64 * 1024;
/// This struct is initialized at postmaster startup, and passed to all the processes via fork().
pub struct IntegratedCacheInitStruct<'t> {
relsize_cache_handle: HashMapInit<'t, RelKey, RelEntry>,
block_map_handle: HashMapInit<'t, BlockKey, BlockEntry>,
}
/// Represents write-access to the integrated cache. This is used by the communicator process.
#[derive(Debug)]
pub struct IntegratedCacheWriteAccess<'t> {
relsize_cache: neon_shmem::hash::HashMapAccess<'t, RelKey, RelEntry>,
block_map: neon_shmem::hash::HashMapAccess<'t, BlockKey, BlockEntry>,
global_lw_lsn: AtomicU64,
pub(crate) file_cache: Option<FileCache>,
// Fields for eviction
clock_hand: std::sync::Mutex<usize>,
// Metrics
page_evictions_counter: IntCounter,
clock_iterations_counter: IntCounter,
// metrics from the hash map
block_map_num_buckets: IntGauge,
block_map_num_buckets_in_use: IntGauge,
relsize_cache_num_buckets: IntGauge,
relsize_cache_num_buckets_in_use: IntGauge,
}
/// Represents read-only access to the integrated cache. Backend processes have this.
pub struct IntegratedCacheReadAccess<'t> {
relsize_cache: neon_shmem::hash::HashMapAccess<'t, RelKey, RelEntry>,
block_map: neon_shmem::hash::HashMapAccess<'t, BlockKey, BlockEntry>,
}
impl<'t> IntegratedCacheInitStruct<'t> {
/// Return the desired size in bytes of the fixed-size shared memory area to reserve for the
/// integrated cache.
pub fn shmem_size() -> usize {
// The relsize cache is fixed-size. The block map is allocated in a separate resizable
// area.
HashMapInit::<RelKey, RelEntry>::estimate_size(RELSIZE_CACHE_SIZE)
}
/// Initialize the shared memory segment. This runs once in postmaster. Returns a struct which
/// will be inherited by all processes through fork.
pub fn shmem_init(
shmem_area: &'t mut [MaybeUninit<u8>],
initial_file_cache_size: u64,
max_file_cache_size: u64,
) -> IntegratedCacheInitStruct<'t> {
// Initialize the relsize cache in the fixed-size area
let relsize_cache_handle =
neon_shmem::hash::HashMapInit::with_fixed(RELSIZE_CACHE_SIZE, shmem_area);
let max_bytes =
HashMapInit::<BlockKey, BlockEntry>::estimate_size(max_file_cache_size as u32);
// Initialize the block map in a separate resizable shared memory area
let shmem_handle = ShmemHandle::new("block mapping", 0, max_bytes).unwrap();
let block_map_handle =
neon_shmem::hash::HashMapInit::with_shmem(initial_file_cache_size as u32, shmem_handle);
IntegratedCacheInitStruct {
relsize_cache_handle,
block_map_handle,
}
}
/// Initialize access to the integrated cache for the communicator worker process
pub fn worker_process_init(
self,
lsn: Lsn,
file_cache: Option<FileCache>,
) -> IntegratedCacheWriteAccess<'t> {
let IntegratedCacheInitStruct {
relsize_cache_handle,
block_map_handle,
} = self;
IntegratedCacheWriteAccess {
relsize_cache: relsize_cache_handle.attach_writer(),
block_map: block_map_handle.attach_writer(),
global_lw_lsn: AtomicU64::new(lsn.0),
file_cache,
clock_hand: std::sync::Mutex::new(0),
page_evictions_counter: metrics::IntCounter::new(
"integrated_cache_evictions",
"Page evictions from the Local File Cache",
)
.unwrap(),
clock_iterations_counter: metrics::IntCounter::new(
"clock_iterations",
"Number of times the clock hand has moved",
)
.unwrap(),
block_map_num_buckets: metrics::IntGauge::new(
"block_map_num_buckets",
"Allocated size of the block cache hash map",
)
.unwrap(),
block_map_num_buckets_in_use: metrics::IntGauge::new(
"block_map_num_buckets_in_use",
"Number of buckets in use in the block cache hash map",
)
.unwrap(),
relsize_cache_num_buckets: metrics::IntGauge::new(
"relsize_cache_num_buckets",
"Allocated size of the relsize cache hash map",
)
.unwrap(),
relsize_cache_num_buckets_in_use: metrics::IntGauge::new(
"relsize_cache_num_buckets_in_use",
"Number of buckets in use in the relsize cache hash map",
)
.unwrap(),
}
}
/// Initialize access to the integrated cache for a backend process
pub fn backend_init(self) -> IntegratedCacheReadAccess<'t> {
let IntegratedCacheInitStruct {
relsize_cache_handle,
block_map_handle,
} = self;
IntegratedCacheReadAccess {
relsize_cache: relsize_cache_handle.attach_reader(),
block_map: block_map_handle.attach_reader(),
}
}
}
/// Value stored in the cache mapping hash table.
struct BlockEntry {
lw_lsn: AtomicLsn,
cache_block: AtomicU64,
pinned: AtomicU64,
// 'referenced' bit for the clock algorithm
referenced: AtomicBool,
}
/// Value stored in the relsize cache hash table.
struct RelEntry {
/// cached size of the relation
/// u32::MAX means 'not known' (that's InvalidBlockNumber in Postgres)
nblocks: AtomicU32,
/// This is the last time the "metadata" of this relation changed, not
/// the contents of the blocks. That is, the size of the relation.
lw_lsn: AtomicLsn,
}
impl std::fmt::Debug for RelEntry {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
fmt.debug_struct("Rel")
.field("nblocks", &self.nblocks.load(Ordering::Relaxed))
.finish()
}
}
impl std::fmt::Debug for BlockEntry {
fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
fmt.debug_struct("Block")
.field("lw_lsn", &self.lw_lsn.load())
.field("cache_block", &self.cache_block.load(Ordering::Relaxed))
.field("pinned", &self.pinned.load(Ordering::Relaxed))
.field("referenced", &self.referenced.load(Ordering::Relaxed))
.finish()
}
}
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Hash, Ord)]
struct RelKey(RelTag);
impl From<&RelTag> for RelKey {
fn from(val: &RelTag) -> RelKey {
RelKey(*val)
}
}
#[derive(Clone, Debug, PartialEq, PartialOrd, Eq, Hash, Ord)]
struct BlockKey {
rel: RelTag,
block_number: u32,
}
impl From<(&RelTag, u32)> for BlockKey {
fn from(val: (&RelTag, u32)) -> BlockKey {
BlockKey {
rel: *val.0,
block_number: val.1,
}
}
}
/// Return type used in the cache's get_*() functions. 'Found' means that the page, or other
/// information that was enqueried, exists in the cache. '
pub enum CacheResult<V> {
/// The enqueried page or other information existed in the cache.
Found(V),
/// The cache doesn't contain the page (or other enqueried information, like relation size). The
/// Lsn is the 'not_modified_since' LSN that should be used in the request to the pageserver to
/// read the page.
NotFound(Lsn),
}
impl<'t> IntegratedCacheWriteAccess<'t> {
pub fn get_rel_size(&'t self, rel: &RelTag) -> CacheResult<u32> {
if let Some(nblocks) = get_rel_size(&self.relsize_cache, rel) {
CacheResult::Found(nblocks)
} else {
let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
CacheResult::NotFound(lsn)
}
}
pub async fn get_page(
&'t self,
rel: &RelTag,
block_number: u32,
dst: impl uring_common::buf::IoBufMut + Send + Sync,
) -> Result<CacheResult<()>, std::io::Error> {
let x = if let Some(block_entry) = self.block_map.get(&BlockKey::from((rel, block_number)))
{
block_entry.referenced.store(true, Ordering::Relaxed);
let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
if cache_block != INVALID_CACHE_BLOCK {
// pin it and release lock
block_entry.pinned.fetch_add(1, Ordering::Relaxed);
(cache_block, DeferredUnpin(block_entry.pinned.as_ptr()))
} else {
return Ok(CacheResult::NotFound(block_entry.lw_lsn.load()));
}
} else {
let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
return Ok(CacheResult::NotFound(lsn));
};
let (cache_block, _deferred_pin) = x;
self.file_cache
.as_ref()
.unwrap()
.read_block(cache_block, dst)
.await?;
// unpin the entry (by implicitly dropping deferred_pin)
Ok(CacheResult::Found(()))
}
pub async fn page_is_cached(
&'t self,
rel: &RelTag,
block_number: u32,
) -> Result<CacheResult<()>, std::io::Error> {
if let Some(block_entry) = self.block_map.get(&BlockKey::from((rel, block_number))) {
// This is used for prefetch requests. Treat the probe as an 'access', to keep it
// in cache.
block_entry.referenced.store(true, Ordering::Relaxed);
let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
if cache_block != INVALID_CACHE_BLOCK {
Ok(CacheResult::Found(()))
} else {
Ok(CacheResult::NotFound(block_entry.lw_lsn.load()))
}
} else {
let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
Ok(CacheResult::NotFound(lsn))
}
}
/// Does the relation exists? CacheResult::NotFound means that the cache doesn't contain that
/// information, i.e. we don't know if the relation exists or not.
pub fn get_rel_exists(&'t self, rel: &RelTag) -> CacheResult<bool> {
// we don't currently cache negative entries, so if the relation is in the cache, it exists
if let Some(_rel_entry) = self.relsize_cache.get(&RelKey::from(rel)) {
CacheResult::Found(true)
} else {
let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
CacheResult::NotFound(lsn)
}
}
pub fn get_db_size(&'t self, _db_oid: u32) -> CacheResult<u64> {
// TODO: it would be nice to cache database sizes too. Getting the database size
// is not a very common operation, but when you do it, it's often interactive, with
// e.g. psql \l+ command, so the user will feel the latency.
// fixme: is this right lsn?
let lsn = Lsn(self.global_lw_lsn.load(Ordering::Relaxed));
CacheResult::NotFound(lsn)
}
pub fn remember_rel_size(&'t self, rel: &RelTag, nblocks: u32, lsn: Lsn) {
match self.relsize_cache.entry(RelKey::from(rel)) {
Entry::Vacant(e) => {
tracing::info!("inserting rel entry for {rel:?}, {nblocks} blocks");
// FIXME: what to do if we run out of memory? Evict other relation entries?
_ = e
.insert(RelEntry {
nblocks: AtomicU32::new(nblocks),
lw_lsn: AtomicLsn::new(lsn.0),
})
.expect("out of memory");
}
Entry::Occupied(e) => {
tracing::info!("updating rel entry for {rel:?}, {nblocks} blocks");
e.get().nblocks.store(nblocks, Ordering::Relaxed);
e.get().lw_lsn.store(lsn);
}
};
}
/// Remember the given page contents in the cache.
pub async fn remember_page(
&'t self,
rel: &RelTag,
block_number: u32,
src: impl uring_common::buf::IoBuf + Send + Sync,
lw_lsn: Lsn,
is_write: bool,
) {
let key = BlockKey::from((rel, block_number));
// FIXME: make this work when file cache is disabled. Or make it mandatory
let file_cache = self.file_cache.as_ref().unwrap();
if is_write {
// there should be no concurrent IOs. If a backend tries to read the page
// at the same time, they may get a torn write. That's the same as with
// regular POSIX filesystem read() and write()
// First check if we have a block in cache already
let mut old_cache_block = None;
let mut found_existing = false;
// NOTE(quantumish): honoring original semantics here (used to be update_with_fn)
// but I don't see any reason why this has to take a write lock.
if let Entry::Occupied(e) = self.block_map.entry(key.clone()) {
let block_entry = e.get();
found_existing = true;
// Prevent this entry from being evicted
let pin_count = block_entry.pinned.fetch_add(1, Ordering::Relaxed);
if pin_count > 0 {
// this is unexpected, because the caller has obtained the io-in-progress lock,
// so no one else should try to modify the page at the same time.
// XXX: and I think a read should not be happening either, because the postgres
// buffer is held locked. TODO: check these conditions and tidy this up a little. Seems fragile to just panic.
panic!("block entry was unexpectedly pinned");
}
let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
old_cache_block = if cache_block != INVALID_CACHE_BLOCK {
Some(cache_block)
} else {
None
};
}
// Allocate a new block if required
let cache_block = old_cache_block.unwrap_or_else(|| {
loop {
if let Some(x) = file_cache.alloc_block() {
break x;
}
if let Some(x) = self.try_evict_one_cache_block() {
break x;
}
}
});
// Write the page to the cache file
file_cache
.write_block(cache_block, src)
.await
.expect("error writing to cache");
// FIXME: handle errors gracefully.
// FIXME: unpin the block entry on error
// Update the block entry
let entry = self.block_map.entry(key);
assert_eq!(found_existing, matches!(entry, Entry::Occupied(_)));
match entry {
Entry::Occupied(e) => {
let block_entry = e.get();
// Update the cache block
let old_blk = block_entry.cache_block.compare_exchange(
INVALID_CACHE_BLOCK,
cache_block,
Ordering::Relaxed,
Ordering::Relaxed,
);
assert!(old_blk == Ok(INVALID_CACHE_BLOCK) || old_blk == Err(cache_block));
block_entry.lw_lsn.store(lw_lsn);
block_entry.referenced.store(true, Ordering::Relaxed);
let pin_count = block_entry.pinned.fetch_sub(1, Ordering::Relaxed);
assert!(pin_count > 0);
}
Entry::Vacant(e) => {
// FIXME: what to do if we run out of memory? Evict other relation entries? Remove
// block entries first?
_ = e
.insert(BlockEntry {
lw_lsn: AtomicLsn::new(lw_lsn.0),
cache_block: AtomicU64::new(cache_block),
pinned: AtomicU64::new(0),
referenced: AtomicBool::new(true),
})
.expect("out of memory");
}
}
} else {
// !is_write
//
// We can assume that it doesn't already exist, because the
// caller is assumed to have already checked it, and holds
// the io-in-progress lock. (The BlockEntry might exist, but no cache block)
// Allocate a new block first
let cache_block = {
loop {
if let Some(x) = file_cache.alloc_block() {
break x;
}
if let Some(x) = self.try_evict_one_cache_block() {
break x;
}
}
};
// Write the page to the cache file
file_cache
.write_block(cache_block, src)
.await
.expect("error writing to cache");
// FIXME: handle errors gracefully.
match self.block_map.entry(key) {
Entry::Occupied(e) => {
let block_entry = e.get();
// FIXME: could there be concurrent readers?
assert!(block_entry.pinned.load(Ordering::Relaxed) == 0);
let old_cache_block =
block_entry.cache_block.swap(cache_block, Ordering::Relaxed);
if old_cache_block != INVALID_CACHE_BLOCK {
panic!(
"remember_page called in !is_write mode, but page is already cached at blk {old_cache_block}"
);
}
}
Entry::Vacant(e) => {
// FIXME: what to do if we run out of memory? Evict other relation entries? Remove
// block entries first?
_ = e
.insert(BlockEntry {
lw_lsn: AtomicLsn::new(lw_lsn.0),
cache_block: AtomicU64::new(cache_block),
pinned: AtomicU64::new(0),
referenced: AtomicBool::new(true),
})
.expect("out of memory");
}
}
}
}
/// Forget information about given relation in the cache. (For DROP TABLE and such)
pub fn forget_rel(&'t self, rel: &RelTag, _nblocks: Option<u32>, flush_lsn: Lsn) {
tracing::info!("forgetting rel entry for {rel:?}");
self.relsize_cache.remove(&RelKey::from(rel));
// update with flush LSN
let _ = self.global_lw_lsn.fetch_max(flush_lsn.0, Ordering::Relaxed);
// also forget all cached blocks for the relation
// FIXME
/*
let mut iter = MapIterator::new(&key_range_for_rel_blocks(rel));
let r = self.cache_tree.start_read();
while let Some((k, _v)) = iter.next(&r) {
let w = self.cache_tree.start_write();
let mut evicted_cache_block = None;
let res = w.update_with_fn(&k, |e| {
if let Some(e) = e {
let block_entry = if let MapEntry::Block(e) = e {
e
} else {
panic!("unexpected map entry type for block key");
};
let cache_block = block_entry
.cache_block
.swap(INVALID_CACHE_BLOCK, Ordering::Relaxed);
if cache_block != INVALID_CACHE_BLOCK {
evicted_cache_block = Some(cache_block);
}
UpdateAction::Remove
} else {
UpdateAction::Nothing
}
});
// FIXME: It's pretty surprising to run out of memory while removing. But
// maybe it can happen because of trying to shrink a node?
res.expect("out of memory");
if let Some(evicted_cache_block) = evicted_cache_block {
self.file_cache
.as_ref()
.unwrap()
.dealloc_block(evicted_cache_block);
}
}
*/
}
// Maintenance routines
/// Evict one block from the file cache. This is used when the file cache fills up
/// Returns the evicted block. It's not put to the free list, so it's available for the
/// caller to use immediately.
pub fn try_evict_one_cache_block(&self) -> Option<CacheBlock> {
let mut clock_hand = self.clock_hand.lock().unwrap();
for _ in 0..100 {
self.clock_iterations_counter.inc();
(*clock_hand) += 1;
let mut evict_this = false;
let num_buckets = self.block_map.get_num_buckets();
match self
.block_map
.get_at_bucket((*clock_hand) % num_buckets)
.as_deref()
{
None => {
// This bucket was unused
}
Some((_, blk_entry)) => {
if !blk_entry.referenced.swap(false, Ordering::Relaxed) {
// Evict this. Maybe.
evict_this = true;
}
}
};
if evict_this {
// grab the write lock
let mut evicted_cache_block = None;
if let Some(e) = self.block_map.entry_at_bucket(*clock_hand % num_buckets) {
let old = e.get();
// note: all the accesses to 'pinned' currently happen
// within update_with_fn(), or while holding ValueReadGuard, which protects from concurrent
// updates. Otherwise, another thread could set the 'pinned'
// flag just after we have checked it here.
if old.pinned.load(Ordering::Relaxed) == 0 {
let _ = self
.global_lw_lsn
.fetch_max(old.lw_lsn.load().0, Ordering::Relaxed);
let cache_block =
old.cache_block.swap(INVALID_CACHE_BLOCK, Ordering::Relaxed);
if cache_block != INVALID_CACHE_BLOCK {
evicted_cache_block = Some(cache_block);
}
e.remove();
}
}
if evicted_cache_block.is_some() {
self.page_evictions_counter.inc();
return evicted_cache_block;
}
}
}
// Give up if we didn't find anything
None
}
/// Resize the local file cache.
pub fn resize_file_cache(&self, num_blocks: u32) {
let old_num_blocks = self.block_map.get_num_buckets() as u32;
if old_num_blocks < num_blocks {
if let Err(err) = self.block_map.grow(num_blocks) {
tracing::warn!(
"could not grow file cache to {} blocks (old size {}): {}",
num_blocks,
old_num_blocks,
err
);
}
} else {
// TODO: Shrinking not implemented yet
}
}
pub fn dump_map(&self, _dst: &mut dyn std::io::Write) {
//FIXME self.cache_map.start_read().dump(dst);
}
}
impl metrics::core::Collector for IntegratedCacheWriteAccess<'_> {
fn desc(&self) -> Vec<&metrics::core::Desc> {
let mut descs = Vec::new();
descs.append(&mut self.page_evictions_counter.desc());
descs.append(&mut self.clock_iterations_counter.desc());
descs.append(&mut self.block_map_num_buckets.desc());
descs.append(&mut self.block_map_num_buckets_in_use.desc());
descs.append(&mut self.relsize_cache_num_buckets.desc());
descs.append(&mut self.relsize_cache_num_buckets_in_use.desc());
descs
}
fn collect(&self) -> Vec<metrics::proto::MetricFamily> {
// Update gauges
self.block_map_num_buckets
.set(self.block_map.get_num_buckets() as i64);
self.block_map_num_buckets_in_use
.set(self.block_map.get_num_buckets_in_use() as i64);
self.relsize_cache_num_buckets
.set(self.relsize_cache.get_num_buckets() as i64);
self.relsize_cache_num_buckets_in_use
.set(self.relsize_cache.get_num_buckets_in_use() as i64);
let mut values = Vec::new();
values.append(&mut self.page_evictions_counter.collect());
values.append(&mut self.clock_iterations_counter.collect());
values.append(&mut self.block_map_num_buckets.collect());
values.append(&mut self.block_map_num_buckets_in_use.collect());
values.append(&mut self.relsize_cache_num_buckets.collect());
values.append(&mut self.relsize_cache_num_buckets_in_use.collect());
values
}
}
/// Read relation size from the cache.
///
/// This is in a separate function so that it can be shared by
/// IntegratedCacheReadAccess::get_rel_size() and IntegratedCacheWriteAccess::get_rel_size()
fn get_rel_size(
r: &neon_shmem::hash::HashMapAccess<RelKey, RelEntry>,
rel: &RelTag,
) -> Option<u32> {
if let Some(rel_entry) = r.get(&RelKey::from(rel)) {
let nblocks = rel_entry.nblocks.load(Ordering::Relaxed);
if nblocks != u32::MAX {
Some(nblocks)
} else {
None
}
} else {
None
}
}
pub enum GetBucketResult {
Occupied(RelTag, u32),
Vacant,
OutOfBounds,
}
/// Accessor for other backends
///
/// This allows backends to read pages from the cache directly, on their own, without making a
/// request to the communicator process.
impl<'t> IntegratedCacheReadAccess<'t> {
pub fn get_rel_size(&'t self, rel: &RelTag) -> Option<u32> {
get_rel_size(&self.relsize_cache, rel)
}
pub fn start_read_op(&'t self) -> BackendCacheReadOp<'t> {
BackendCacheReadOp {
read_guards: Vec::new(),
map_access: self,
}
}
/// Check if the given page is present in the cache
pub fn cache_contains_page(&'t self, rel: &RelTag, block_number: u32) -> bool {
self.block_map
.get(&BlockKey::from((rel, block_number)))
.is_some()
}
pub fn get_bucket(&self, bucket_no: usize) -> GetBucketResult {
match self.block_map.get_at_bucket(bucket_no).as_deref() {
None => {
// free bucket, or out of bounds
if bucket_no >= self.block_map.get_num_buckets() {
GetBucketResult::OutOfBounds
} else {
GetBucketResult::Vacant
}
}
Some((key, _)) => GetBucketResult::Occupied(key.rel, key.block_number),
}
}
}
pub struct BackendCacheReadOp<'t> {
read_guards: Vec<DeferredUnpin>,
map_access: &'t IntegratedCacheReadAccess<'t>,
}
impl<'e> BackendCacheReadOp<'e> {
/// Initiate a read of the page from the cache.
///
/// This returns the "cache block number", i.e. the block number within the cache file, where
/// the page's contents is stored. To get the page contents, the caller needs to read that block
/// from the cache file. This returns a guard object that you must hold while it performs the
/// read. It's possible that while you are performing the read, the cache block is invalidated.
/// After you have completed the read, call BackendCacheReadResult::finish() to check if the
/// read was in fact valid or not. If it was concurrently invalidated, you need to retry.
pub fn get_page(&mut self, rel: &RelTag, block_number: u32) -> Option<u64> {
if let Some(block_entry) = self
.map_access
.block_map
.get(&BlockKey::from((rel, block_number)))
{
block_entry.referenced.store(true, Ordering::Relaxed);
let cache_block = block_entry.cache_block.load(Ordering::Relaxed);
if cache_block != INVALID_CACHE_BLOCK {
block_entry.pinned.fetch_add(1, Ordering::Relaxed);
self.read_guards
.push(DeferredUnpin(block_entry.pinned.as_ptr()));
Some(cache_block)
} else {
None
}
} else {
None
}
}
pub fn finish(self) -> bool {
// TODO: currently, we hold a pin on the in-memory map, so concurrent invalidations are not
// possible. But if we switch to optimistic locking, this would return 'false' if the
// optimistic locking failed and you need to retry.
true
}
}
/// A hack to decrement an AtomicU64 on drop. This is used to decrement the pin count
/// of a BlockEntry. The safety depends on the fact that the BlockEntry is not evicted
/// or moved while it's pinned.
struct DeferredUnpin(*mut u64);
unsafe impl Sync for DeferredUnpin {}
unsafe impl Send for DeferredUnpin {}
impl Drop for DeferredUnpin {
fn drop(&mut self) {
// unpin it
unsafe {
let pin_ref = AtomicU64::from_ptr(self.0);
pin_ref.fetch_sub(1, Ordering::Relaxed);
}
}
}

View File

@@ -1,29 +1,5 @@
//! Three main parts:
//! - async tokio communicator core, which receives requests and processes them.
//! - Main loop and requests queues, which routes requests from backends to the core
//! - the per-backend glue code, which submits requests
mod backend_comms;
// mark this 'pub', because these functions are called from C code. Otherwise, the compiler
// complains about a bunch of structs and enum variants being unused, because it thinkgs
// the functions that use them are never called. There are some C-callable functions in
// other modules too, but marking this as pub is currently enough to silence the warnings
//
// TODO: perhaps collect *all* the extern "C" functions to one module?
pub mod backend_interface;
mod file_cache;
mod init;
mod integrated_cache;
mod neon_request;
mod worker_process;
mod global_allocator;
/// Name of the Unix Domain Socket that serves the metrics, and other APIs in the
/// future. This is within the Postgres data directory.
const NEON_COMMUNICATOR_SOCKET_NAME: &str = "neon-communicator.socket";
// FIXME: get this from postgres headers somehow
pub const BLCKSZ: usize = 8192;

View File

@@ -1,432 +0,0 @@
// Definitions of some core PostgreSQL datatypes.
/// XLogRecPtr is defined in "access/xlogdefs.h" as:
///
/// ```
/// typedef uint64 XLogRecPtr;
/// ```
/// cbindgen:no-export
pub type XLogRecPtr = u64;
pub type CLsn = XLogRecPtr;
pub type COid = u32;
// This conveniently matches PG_IOV_MAX
pub const MAX_GETPAGEV_PAGES: usize = 32;
pub const INVALID_BLOCK_NUMBER: u32 = u32::MAX;
use std::ffi::CStr;
use pageserver_page_api::{self as page_api, SlruKind};
/// Request from a Postgres backend to the communicator process
#[allow(clippy::large_enum_variant)]
#[repr(C)]
#[derive(Copy, Clone, Debug, strum_macros::EnumDiscriminants)]
#[strum_discriminants(derive(measured::FixedCardinalityLabel))]
pub enum NeonIORequest {
Empty,
// Read requests. These are C-friendly variants of the corresponding structs in
// pageserver_page_api.
RelSize(CRelSizeRequest),
GetPageV(CGetPageVRequest),
ReadSlruSegment(CReadSlruSegmentRequest),
PrefetchV(CPrefetchVRequest),
DbSize(CDbSizeRequest),
// Write requests. These are needed to keep the relation size cache and LFC up-to-date.
// They are not sent to the pageserver.
WritePage(CWritePageRequest),
RelExtend(CRelExtendRequest),
RelZeroExtend(CRelZeroExtendRequest),
RelCreate(CRelCreateRequest),
RelTruncate(CRelTruncateRequest),
RelUnlink(CRelUnlinkRequest),
// Other requests
UpdateCachedRelSize(CUpdateCachedRelSizeRequest),
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub enum NeonIOResult {
Empty,
/// InvalidBlockNumber == 0xffffffff means "rel does not exist"
RelSize(u32),
/// the result pages are written to the shared memory addresses given in the request
GetPageV,
/// The result is written to the file, path to which is provided
/// in the request. The [`u64`] value here is the number of blocks.
ReadSlruSegment(u64),
/// A prefetch request returns as soon as the request has been received by the communicator.
/// It is processed in the background.
PrefetchVLaunched,
DbSize(u64),
// FIXME design compact error codes. Can't easily pass a string or other dynamic data.
// currently, this is 'errno'
Error(i32),
Aborted,
/// used for all write requests
WriteOK,
}
impl NeonIORequest {
/// All requests include a unique request ID, which can be used to trace the execution
/// of a request all the way to the pageservers. The request ID needs to be unique
/// within the lifetime of the Postgres instance (but not across servers or across
/// restarts of the same server).
pub fn request_id(&self) -> u64 {
use NeonIORequest::*;
match self {
Empty => 0,
RelSize(req) => req.request_id,
GetPageV(req) => req.request_id,
ReadSlruSegment(req) => req.request_id,
PrefetchV(req) => req.request_id,
DbSize(req) => req.request_id,
WritePage(req) => req.request_id,
RelExtend(req) => req.request_id,
RelZeroExtend(req) => req.request_id,
RelCreate(req) => req.request_id,
RelTruncate(req) => req.request_id,
RelUnlink(req) => req.request_id,
UpdateCachedRelSize(req) => req.request_id,
}
}
}
/// Special quick result to a CGetPageVRequest request, indicating that the
/// the requested pages are present in the local file cache. The backend can
/// read the blocks directly from the given LFC blocks.
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CCachedGetPageVResult {
pub cache_block_numbers: [u64; MAX_GETPAGEV_PAGES],
}
/// ShmemBuf represents a buffer in shared memory.
///
/// SAFETY: The pointer must point to an area in shared memory. The functions allow you to liberally
/// get a mutable pointer to the contents; it is the caller's responsibility to ensure that you
/// don't access a buffer that's you're not allowed to. Inappropriate access to the buffer doesn't
/// violate Rust's safety semantics, but it will mess up and crash Postgres.
///
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct ShmemBuf {
// Pointer to where the result is written or where to read from. Must point into a buffer in shared memory!
pub ptr: *mut u8,
}
unsafe impl Send for ShmemBuf {}
unsafe impl Sync for ShmemBuf {}
unsafe impl uring_common::buf::IoBuf for ShmemBuf {
fn stable_ptr(&self) -> *const u8 {
self.ptr
}
fn bytes_init(&self) -> usize {
crate::BLCKSZ
}
fn bytes_total(&self) -> usize {
crate::BLCKSZ
}
}
unsafe impl uring_common::buf::IoBufMut for ShmemBuf {
fn stable_mut_ptr(&mut self) -> *mut u8 {
self.ptr
}
unsafe fn set_init(&mut self, pos: usize) {
if pos > crate::BLCKSZ {
panic!(
"set_init called past end of buffer, pos {}, buffer size {}",
pos,
crate::BLCKSZ
);
}
}
}
impl ShmemBuf {
pub fn as_mut_ptr(&self) -> *mut u8 {
self.ptr
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelSizeRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub allow_missing: bool,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CGetPageVRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
// These fields define where the result is written. Must point into a buffer in shared memory!
pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CReadSlruSegmentRequest {
pub request_id: u64,
pub slru_kind: SlruKind,
pub segment_number: u32,
pub request_lsn: CLsn,
/// Must be a null-terminated C string containing the file path
/// where the communicator will write the SLRU segment.
pub destination_file_path: ShmemBuf,
}
impl CReadSlruSegmentRequest {
/// Returns the file path where the communicator will write the
/// SLRU segment.
pub(crate) fn destination_file_path(&self) -> String {
unsafe { CStr::from_ptr(self.destination_file_path.as_mut_ptr() as *const _) }
.to_string_lossy()
.into_owned()
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CPrefetchVRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u8,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CDbSizeRequest {
pub request_id: u64,
pub db_oid: COid,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CWritePageRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// `src` defines the new page contents. Must point into a buffer in shared memory!
pub src: ShmemBuf,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelExtendRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub lsn: CLsn,
// `src` defines the new page contents. Must point into a buffer in shared memory!
pub src: ShmemBuf,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelZeroExtendRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub block_number: u32,
pub nblocks: u32,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelCreateRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelTruncateRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub nblocks: u32,
pub lsn: CLsn,
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CRelUnlinkRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub lsn: CLsn,
}
impl CRelSizeRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CGetPageVRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CPrefetchVRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CWritePageRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelExtendRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelZeroExtendRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelCreateRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelTruncateRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
impl CRelUnlinkRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}
#[repr(C)]
#[derive(Copy, Clone, Debug)]
pub struct CUpdateCachedRelSizeRequest {
pub request_id: u64,
pub spc_oid: COid,
pub db_oid: COid,
pub rel_number: u32,
pub fork_number: u8,
pub nblocks: u32,
pub lsn: CLsn,
}
impl CUpdateCachedRelSizeRequest {
pub fn reltag(&self) -> page_api::RelTag {
page_api::RelTag {
spcnode: self.spc_oid,
dbnode: self.db_oid,
relnode: self.rel_number,
forknum: self.fork_number,
}
}
}

View File

@@ -4,13 +4,10 @@
//!
//! These are called from the communicator threads! Careful what you do, most Postgres
//! functions are not safe to call in that context.
use utils::lsn::Lsn;
#[cfg(not(test))]
unsafe extern "C" {
pub fn notify_proc_unsafe(procno: std::ffi::c_int);
pub fn callback_set_my_latch_unsafe();
pub fn callback_get_request_lsn_unsafe() -> crate::neon_request::CLsn;
pub fn callback_get_lfc_metrics_unsafe() -> LfcMetrics;
}
@@ -19,36 +16,20 @@ unsafe extern "C" {
// package, but the code coverage build still builds these and tries to link with the
// external C code.)
#[cfg(test)]
unsafe fn notify_proc_unsafe(_procno: std::ffi::c_int) {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_set_my_latch_unsafe() {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_get_request_lsn_unsafe() -> crate::neon_request::CLsn {
panic!("not usable in unit tests");
}
#[cfg(test)]
unsafe fn callback_get_lfc_metrics_unsafe() -> LfcMetrics {
panic!("not usable in unit tests");
}
// safe wrappers
pub(super) fn notify_proc(procno: std::ffi::c_int) {
unsafe { notify_proc_unsafe(procno) };
}
pub(super) fn callback_set_my_latch() {
unsafe { callback_set_my_latch_unsafe() };
}
pub(super) fn get_request_lsn() -> Lsn {
Lsn(unsafe { callback_get_request_lsn_unsafe() })
}
pub(super) fn callback_get_lfc_metrics() -> LfcMetrics {
unsafe { callback_get_lfc_metrics_unsafe() }
}

View File

@@ -28,7 +28,7 @@ use tokio::net::UnixListener;
use crate::NEON_COMMUNICATOR_SOCKET_NAME;
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
impl<'a> CommunicatorWorkerProcessStruct<'a> {
impl CommunicatorWorkerProcessStruct {
/// Launch the listener
pub(crate) async fn launch_control_socket_listener(
&'static self,
@@ -38,7 +38,6 @@ impl<'a> CommunicatorWorkerProcessStruct<'a> {
.route("/metrics", get(get_metrics))
.route("/autoscaling_metrics", get(get_autoscaling_metrics))
.route("/debug/panic", get(handle_debug_panic))
.route("/debug/dump_cache_map", get(dump_cache_map))
.with_state(self);
// If the server is restarted, there might be an old socket still
@@ -69,7 +68,7 @@ impl<'a> CommunicatorWorkerProcessStruct<'a> {
}
/// Expose all Prometheus metrics.
async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct<'_>>) -> Response {
async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct>) -> Response {
tracing::trace!("/metrics requested");
metrics_to_response(&state).await
}
@@ -78,15 +77,13 @@ async fn get_metrics(State(state): State<&CommunicatorWorkerProcessStruct<'_>>)
///
/// This is a subset of all the metrics.
async fn get_autoscaling_metrics(
State(state): State<&CommunicatorWorkerProcessStruct<'_>>,
State(state): State<&CommunicatorWorkerProcessStruct>,
) -> Response {
tracing::trace!("/metrics requested");
metrics_to_response(&state.lfc_metrics).await
}
async fn handle_debug_panic(
State(_state): State<&CommunicatorWorkerProcessStruct<'_>>,
) -> Response {
async fn handle_debug_panic(State(_state): State<&CommunicatorWorkerProcessStruct>) -> Response {
panic!("test HTTP handler task panic");
}
@@ -103,16 +100,3 @@ async fn metrics_to_response(metrics: &(dyn MetricGroup<BufferedTextEncoder> + S
.body(Body::from(enc.finish()))
.unwrap()
}
async fn dump_cache_map(
State(state): State<&CommunicatorWorkerProcessStruct<'static>>,
) -> Response {
let mut buf: Vec<u8> = Vec::new();
state.cache.dump_map(&mut buf);
Response::builder()
.status(StatusCode::OK)
.header(CONTENT_TYPE, "application/text")
.body(Body::from(buf))
.unwrap()
}

View File

@@ -1,95 +0,0 @@
//! Lock table to ensure that only one IO request is in flight for a given
//! block (or relation or database metadata) at a time
use std::cmp::Eq;
use std::hash::Hash;
use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};
use clashmap::ClashMap;
use clashmap::Entry;
use pageserver_page_api::RelTag;
#[derive(Clone, Eq, Hash, PartialEq)]
pub enum RequestInProgressKey {
Db(u32),
Rel(RelTag),
Block(RelTag, u32),
}
type RequestId = u64;
pub type RequestInProgressTable = MutexHashMap<RequestInProgressKey, RequestId>;
// more primitive locking thingie:
pub struct MutexHashMap<K, V>
where
K: Clone + Eq + Hash,
{
lock_table: ClashMap<K, (V, Arc<Mutex<()>>)>,
}
pub struct MutexHashMapGuard<'a, K, V>
where
K: Clone + Eq + Hash,
{
pub key: K,
map: &'a MutexHashMap<K, V>,
mutex: Arc<Mutex<()>>,
_guard: OwnedMutexGuard<()>,
}
impl<'a, K, V> Drop for MutexHashMapGuard<'a, K, V>
where
K: Clone + Eq + Hash,
{
fn drop(&mut self) {
let (_old_key, old_val) = self.map.lock_table.remove(&self.key).unwrap();
assert!(Arc::ptr_eq(&old_val.1, &self.mutex));
// the guard will be dropped as we return
}
}
impl<K, V> MutexHashMap<K, V>
where
K: Clone + Eq + Hash,
V: std::fmt::Display + Copy,
{
pub fn new() -> MutexHashMap<K, V> {
MutexHashMap {
lock_table: ClashMap::new(),
}
}
pub async fn lock<'a>(&'a self, key: K, val: V) -> MutexHashMapGuard<'a, K, V> {
let my_mutex = Arc::new(Mutex::new(()));
let my_guard = Arc::clone(&my_mutex).lock_owned().await;
loop {
let (request_id, lock) = match self.lock_table.entry(key.clone()) {
Entry::Occupied(e) => {
let e = e.get();
(e.0, Arc::clone(&e.1))
}
Entry::Vacant(e) => {
e.insert((val, Arc::clone(&my_mutex)));
break;
}
};
tracing::info!("waiting for conflicting IO {request_id} to complete");
let _ = lock.lock().await;
tracing::info!("conflicting IO {request_id} completed");
}
MutexHashMapGuard {
key,
map: self,
mutex: my_mutex,
_guard: my_guard,
}
}
}

View File

@@ -1,111 +1,34 @@
use std::collections::HashMap;
use std::os::fd::AsRawFd;
use std::os::fd::OwnedFd;
use std::path::PathBuf;
use std::str::FromStr as _;
use crate::backend_comms::NeonIORequestSlot;
use crate::file_cache::FileCache;
use crate::global_allocator::MyAllocatorCollector;
use crate::init::CommunicatorInitStruct;
use crate::integrated_cache::{CacheResult, IntegratedCacheWriteAccess};
use crate::neon_request::{CGetPageVRequest, CPrefetchVRequest};
use crate::neon_request::{INVALID_BLOCK_NUMBER, NeonIORequest, NeonIOResult};
use crate::worker_process::in_progress_ios::{RequestInProgressKey, RequestInProgressTable};
use crate::worker_process::lfc_metrics::LfcMetricsCollector;
use pageserver_client_grpc::{PageserverClient, ShardSpec, ShardStripeSize};
use pageserver_page_api as page_api;
use tokio::io::AsyncReadExt;
use tokio_pipe::PipeRead;
use uring_common::buf::IoBuf;
use measured::MetricGroup;
use measured::metric::MetricEncoding;
use measured::metric::gauge::GaugeState;
use measured::metric::group::Encoding;
use measured::{Gauge, GaugeVec};
use utils::id::{TenantId, TimelineId};
use super::callbacks::{get_request_lsn, notify_proc};
use tracing::{debug, error, info, info_span, trace};
use utils::lsn::Lsn;
pub struct CommunicatorWorkerProcessStruct<'a> {
/// Tokio runtime that the main loop and any other related tasks runs in.
pub struct CommunicatorWorkerProcessStruct {
runtime: tokio::runtime::Runtime,
/// Client to communicate with the pageserver
client: Option<PageserverClient>,
/// Request slots that backends use to send IO requests to the communicator.
neon_request_slots: &'a [NeonIORequestSlot],
/// Notification pipe. Backends use this to notify the communicator that a request is waiting to
/// be processed in one of the request slots.
submission_pipe_read_fd: OwnedFd,
/// Locking table for all in-progress IO requests.
in_progress_table: RequestInProgressTable,
/// Local File Cache, relation size tracking, last-written LSN tracking
pub(crate) cache: IntegratedCacheWriteAccess<'a>,
/*** Metrics ***/
pub(crate) lfc_metrics: LfcMetricsCollector,
request_counters: GaugeVec<RequestTypeLabelGroupSet>,
getpage_cache_misses_counter: Gauge,
getpage_cache_hits_counter: Gauge,
// For the requests that affect multiple blocks, have separate counters for the # of blocks affected
request_nblocks_counters: GaugeVec<RequestTypeLabelGroupSet>,
#[allow(dead_code)]
allocator_metrics: MyAllocatorCollector,
}
// Define a label group, consisting of 1 or more label values
#[derive(measured::LabelGroup)]
#[label(set = RequestTypeLabelGroupSet)]
struct RequestTypeLabelGroup {
request_type: crate::neon_request::NeonIORequestDiscriminants,
}
impl RequestTypeLabelGroup {
fn from_req(req: &NeonIORequest) -> Self {
RequestTypeLabelGroup {
request_type: req.into(),
}
}
}
/// Launch the communicator process's Rust subsystems
#[allow(clippy::too_many_arguments)]
pub(super) fn init(
cis: CommunicatorInitStruct,
tenant_id: Option<&str>,
timeline_id: Option<&str>,
auth_token: Option<&str>,
shard_map: HashMap<utils::shard::ShardIndex, String>,
stripe_size: Option<ShardStripeSize>,
initial_file_cache_size: u64,
file_cache_path: Option<PathBuf>,
) -> Result<&'static CommunicatorWorkerProcessStruct<'static>, String> {
) -> Result<&'static CommunicatorWorkerProcessStruct, String> {
// The caller validated these already
let tenant_id = tenant_id
let _tenant_id = tenant_id
.map(TenantId::from_str)
.transpose()
.map_err(|e| format!("invalid tenant ID: {e}"))?;
let timeline_id = timeline_id
let _timeline_id = timeline_id
.map(TimelineId::from_str)
.transpose()
.map_err(|e| format!("invalid timeline ID: {e}"))?;
let shard_spec =
ShardSpec::new(shard_map, stripe_size).map_err(|e| format!("invalid shard spec: {e}:"))?;
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
@@ -113,72 +36,16 @@ pub(super) fn init(
.build()
.unwrap();
let last_lsn = get_request_lsn();
let file_cache = if let Some(path) = file_cache_path {
Some(FileCache::new(&path, initial_file_cache_size).expect("could not create cache file"))
} else {
// FIXME: temporarily for testing, use LFC even if disabled
Some(
FileCache::new(&PathBuf::from("new_filecache"), 1000)
.expect("could not create cache file"),
)
};
// Initialize subsystems
let cache = cis
.integrated_cache_init_struct
.worker_process_init(last_lsn, file_cache);
debug!("Initialised integrated cache: {cache:?}");
let client = if let (Some(tenant_id), Some(timeline_id)) = (tenant_id, timeline_id) {
let _guard = runtime.enter();
Some(
PageserverClient::new(
tenant_id,
timeline_id,
shard_spec,
auth_token.map(|s| s.to_string()),
None,
)
.expect("could not create client"),
)
} else {
None
};
let worker_struct = CommunicatorWorkerProcessStruct {
// Note: it's important to not drop the runtime, or all the tasks are dropped
// too. Including it in the returned struct is one way to keep it around.
runtime,
neon_request_slots: cis.neon_request_slots,
client,
cache,
submission_pipe_read_fd: cis.submission_pipe_read_fd,
in_progress_table: RequestInProgressTable::new(),
// metrics
lfc_metrics: LfcMetricsCollector,
request_counters: GaugeVec::new(),
getpage_cache_misses_counter: Gauge::new(),
getpage_cache_hits_counter: Gauge::new(),
request_nblocks_counters: GaugeVec::new(),
allocator_metrics: MyAllocatorCollector::new(),
};
let worker_struct = Box::leak(Box::new(worker_struct));
let main_loop_handle = worker_struct.runtime.spawn(worker_struct.run());
worker_struct.runtime.spawn(async {
let err = main_loop_handle.await.unwrap_err();
error!("error: {err:?}");
});
// Start the listener on the control socket
worker_struct
.runtime
@@ -188,577 +55,12 @@ pub(super) fn init(
Ok(worker_struct)
}
impl<'t> CommunicatorWorkerProcessStruct<'t> {
/// Update the configuration
pub(super) fn update_shard_map(
&self,
new_shard_map: HashMap<utils::shard::ShardIndex, String>,
stripe_size: Option<ShardStripeSize>,
) {
let client = self.client.as_ref().unwrap();
let shard_spec = ShardSpec::new(new_shard_map, stripe_size).expect("invalid shard spec");
{
let _in_runtime = self.runtime.enter();
if let Err(err) = client.update_shards(shard_spec) {
tracing::error!("could not update shard map: {err:?}");
}
}
}
/// Main loop of the worker process. Receive requests from the backends and process them.
pub(super) async fn run(&'static self) {
let mut idxbuf: [u8; 4] = [0; 4];
let mut submission_pipe_read =
PipeRead::try_from(self.submission_pipe_read_fd.as_raw_fd()).expect("invalid pipe fd");
loop {
// Wait for a backend to ring the doorbell
match submission_pipe_read.read(&mut idxbuf).await {
Ok(4) => {}
Ok(nbytes) => panic!("short read ({nbytes} bytes) on communicator pipe"),
Err(e) => panic!("error reading from communicator pipe: {e}"),
}
let slot_idx = u32::from_ne_bytes(idxbuf) as usize;
// Read the IO request from the slot indicated in the wakeup
let Some(slot) = self.neon_request_slots[slot_idx].start_processing_request() else {
// This currently should not happen. But if we had multiple threads picking up
// requests, and without waiting for the notifications, it could.
panic!("no request in slot");
};
// Ok, we have ownership of this request now. We must process it now, there's no going
// back.
//
// Spawn a separate task for every request. That's a little excessive for requests that
// can be quickly satisfied from the cache, but we expect that to be rare, because the
// requesting backend would have already checked the cache.
tokio::spawn(async move {
use tracing::Instrument;
let request_id = slot.get_request().request_id();
let owner_procno = slot.get_owner_procno();
let span = info_span!(
"processing",
request_id = request_id,
slot_idx = slot_idx,
procno = owner_procno,
);
async {
// FIXME: as a temporary hack, abort the request if we don't get a response
// promptly.
//
// Lots of regression tests are getting stuck and failing at the moment,
// this makes them fail a little faster, which it faster to iterate.
// This needs to be removed once more regression tests are passing.
// See also similar hack in the backend code, in wait_request_completion()
let result = tokio::time::timeout(
tokio::time::Duration::from_secs(30),
self.handle_request(slot.get_request()),
)
.await
.unwrap_or_else(|_elapsed| {
info!("request {request_id} timed out");
NeonIOResult::Error(libc::ETIMEDOUT)
});
trace!("request {request_id} at slot {slot_idx} completed");
// Ok, we have completed the IO. Mark the request as completed. After that,
// we no longer have ownership of the slot, and must not modify it.
slot.completed(result);
// Notify the backend about the completion. (Note that the backend might see
// the completed status even before this; this is just a wakeup)
notify_proc(owner_procno);
}
.instrument(span)
.await
});
}
}
/// Compute the 'request_lsn' to use for a pageserver request
fn request_lsns(&self, not_modified_since_lsn: Lsn) -> page_api::ReadLsn {
let mut request_lsn = get_request_lsn();
// Is it possible that the last-written LSN is ahead of last flush LSN? Generally not, we
// shouldn't evict a page from the buffer cache before all its modifications have been
// safely flushed. That's the "WAL before data" rule. However, there are a few exceptions:
//
// - when creation an index: _bt_blwritepage logs the full page without flushing WAL before
// smgrextend (files are fsynced before build ends).
//
// XXX: If we make a request LSN greater than the current WAL flush LSN, the pageserver would
// block waiting for the WAL arrive, until we flush it and it propagates through the
// safekeepers to the pageserver. If there's nothing that forces the WAL to be flushed,
// the pageserver would get stuck waiting forever. To avoid that, all the write-
// functions in communicator_new.c call XLogSetAsyncXactLSN(). That nudges the WAL writer to
// perform the flush relatively soon.
//
// It would perhaps be nicer to do the WAL flush here, but it's tricky to call back into
// Postgres code to do that from here. That's why we rely on communicator_new.c to do the
// calls "pre-emptively".
//
// FIXME: Because of the above, it can still happen that the flush LSN is ahead of
// not_modified_since, if the WAL writer hasn't done the flush yet. It would be nice to know
// if there are other cases like that that we have mised, but unfortunately we cannot turn
// this into an assertion because of that legit case.
//
// See also the old logic in neon_get_request_lsns() C function
if not_modified_since_lsn > request_lsn {
tracing::info!(
"not_modified_since_lsn {} is ahead of last flushed LSN {}",
not_modified_since_lsn,
request_lsn
);
request_lsn = not_modified_since_lsn;
}
page_api::ReadLsn {
request_lsn,
not_modified_since_lsn: Some(not_modified_since_lsn),
}
}
/// Handle one IO request
async fn handle_request(&'static self, request: &'_ NeonIORequest) -> NeonIOResult {
let client = self
.client
.as_ref()
.expect("cannot handle requests without client");
self.request_counters
.inc(RequestTypeLabelGroup::from_req(request));
match request {
NeonIORequest::Empty => {
error!("unexpected Empty IO request");
NeonIOResult::Error(0)
}
NeonIORequest::RelSize(req) => {
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Rel(rel), req.request_id)
.await;
// Check the cache first
let not_modified_since = match self.cache.get_rel_size(&rel) {
CacheResult::Found(nblocks) => {
tracing::trace!("found relsize for {:?} in cache: {}", rel, nblocks);
return NeonIOResult::RelSize(nblocks);
}
// XXX: we don't cache negative entries, so if there's no entry in the cache, it could mean
// that the relation doesn't exist or that we don't have it cached.
CacheResult::NotFound(lsn) => lsn,
};
let read_lsn = self.request_lsns(not_modified_since);
match client
.get_rel_size(page_api::GetRelSizeRequest {
read_lsn,
rel,
allow_missing: req.allow_missing,
})
.await
{
Ok(Some(nblocks)) => {
// update the cache
tracing::info!(
"updated relsize for {:?} in cache: {}, lsn {}",
rel,
nblocks,
read_lsn
);
self.cache
.remember_rel_size(&rel, nblocks, not_modified_since);
NeonIOResult::RelSize(nblocks)
}
Ok(None) => {
// TODO: cache negative entry?
NeonIOResult::RelSize(INVALID_BLOCK_NUMBER)
}
Err(err) => {
info!("tonic error: {err:?}");
NeonIOResult::Error(0)
}
}
}
NeonIORequest::GetPageV(req) => match self.handle_get_pagev_request(req).await {
Ok(()) => NeonIOResult::GetPageV,
Err(errno) => NeonIOResult::Error(errno),
},
NeonIORequest::ReadSlruSegment(req) => {
let lsn = Lsn(req.request_lsn);
let file_path = req.destination_file_path();
match client
.get_slru_segment(page_api::GetSlruSegmentRequest {
read_lsn: self.request_lsns(lsn),
kind: req.slru_kind,
segno: req.segment_number,
})
.await
{
Ok(slru_bytes) => {
if let Err(e) = tokio::fs::write(&file_path, &slru_bytes).await {
info!("could not write slru segment to file {file_path}: {e}");
return NeonIOResult::Error(e.raw_os_error().unwrap_or(libc::EIO));
}
let blocks_count = slru_bytes.len() / crate::BLCKSZ;
NeonIOResult::ReadSlruSegment(blocks_count as _)
}
Err(err) => {
info!("tonic error: {err:?}");
NeonIOResult::Error(0)
}
}
}
NeonIORequest::PrefetchV(req) => {
self.request_nblocks_counters
.inc_by(RequestTypeLabelGroup::from_req(request), req.nblocks as i64);
let req = *req;
tokio::spawn(async move { self.handle_prefetchv_request(&req).await });
NeonIOResult::PrefetchVLaunched
}
NeonIORequest::DbSize(req) => {
let _in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Db(req.db_oid), req.request_id)
.await;
// Check the cache first
let not_modified_since = match self.cache.get_db_size(req.db_oid) {
CacheResult::Found(db_size) => {
// get_page already copied the block content to the destination
return NeonIOResult::DbSize(db_size);
}
CacheResult::NotFound(lsn) => lsn,
};
match client
.get_db_size(page_api::GetDbSizeRequest {
read_lsn: self.request_lsns(not_modified_since),
db_oid: req.db_oid,
})
.await
{
Ok(db_size) => NeonIOResult::DbSize(db_size),
Err(err) => {
info!("tonic error: {err:?}");
NeonIOResult::Error(0)
}
}
}
// Write requests
NeonIORequest::WritePage(req) => {
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(
RequestInProgressKey::Block(rel, req.block_number),
req.request_id,
)
.await;
// We must at least update the last-written LSN on the page, but also store the page
// image in the LFC while we still have it
self.cache
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
.await;
NeonIOResult::WriteOK
}
NeonIORequest::RelExtend(req) => {
let rel = req.reltag();
let _in_progress_guard = self
.in_progress_table
.lock(
RequestInProgressKey::Block(rel, req.block_number),
req.request_id,
)
.await;
// We must at least update the last-written LSN on the page and the relation size,
// but also store the page image in the LFC while we still have it
self.cache
.remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
.await;
self.cache
.remember_rel_size(&req.reltag(), req.block_number + 1, Lsn(req.lsn));
NeonIOResult::WriteOK
}
NeonIORequest::RelZeroExtend(req) => {
self.request_nblocks_counters
.inc_by(RequestTypeLabelGroup::from_req(request), req.nblocks as i64);
// TODO: need to grab an io-in-progress lock for this? I guess not
// TODO: We could put the empty pages to the cache. Maybe have
// a marker on the block entries for all-zero pages, instead of
// actually storing the empty pages.
self.cache.remember_rel_size(
&req.reltag(),
req.block_number + req.nblocks,
Lsn(req.lsn),
);
NeonIOResult::WriteOK
}
NeonIORequest::RelCreate(req) => {
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache.remember_rel_size(&req.reltag(), 0, Lsn(req.lsn));
NeonIOResult::WriteOK
}
NeonIORequest::RelTruncate(req) => {
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache
.remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
NeonIOResult::WriteOK
}
NeonIORequest::RelUnlink(req) => {
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache.forget_rel(&req.reltag(), None, Lsn(req.lsn));
NeonIOResult::WriteOK
}
NeonIORequest::UpdateCachedRelSize(req) => {
// TODO: need to grab an io-in-progress lock for this? I guess not
self.cache
.remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
NeonIOResult::WriteOK
}
}
}
/// Subroutine to handle a GetPageV request, since it's a little more complicated than
/// others.
async fn handle_get_pagev_request(&'t self, req: &CGetPageVRequest) -> Result<(), i32> {
let client = self
.client
.as_ref()
.expect("cannot handle requests without client");
let rel = req.reltag();
// Check the cache first
//
// Note: Because the backends perform a direct lookup in the cache before sending
// the request to the communicator process, we expect the pages to almost never
// be already in cache. It could happen if:
// 1. two backends try to read the same page at the same time, but that should never
// happen because there's higher level locking in the Postgres buffer manager, or
// 2. a prefetch request finished at the same time as a backend requested the
// page. That's much more likely.
let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
for i in 0..req.nblocks {
let blkno = req.block_number + i as u32;
// note: this is deadlock-safe even though we hold multiple locks at the same time,
// because they're always acquired in the same order.
let in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Block(rel, blkno), req.request_id)
.await;
let dest = req.dest[i as usize];
let not_modified_since = match self.cache.get_page(&rel, blkno, dest).await {
Ok(CacheResult::Found(_)) => {
// get_page already copied the block content to the destination
trace!("found blk {} in rel {:?} in LFC", blkno, rel);
continue;
}
Ok(CacheResult::NotFound(lsn)) => lsn,
Err(_io_error) => return Err(-1), // FIXME errno?
};
cache_misses.push((blkno, not_modified_since, dest, in_progress_guard));
}
self.getpage_cache_misses_counter
.inc_by(cache_misses.len() as i64);
self.getpage_cache_hits_counter
.inc_by(req.nblocks as i64 - cache_misses.len() as i64);
if cache_misses.is_empty() {
return Ok(());
}
let not_modified_since = cache_misses
.iter()
.map(|(_blkno, lsn, _dest, _guard)| *lsn)
.max()
.unwrap();
// Construct a pageserver request for the cache misses
let block_numbers: Vec<u32> = cache_misses
.iter()
.map(|(blkno, _lsn, _dest, _guard)| *blkno)
.collect();
let read_lsn = self.request_lsns(not_modified_since);
info!(
"sending getpage request for blocks {:?} in rel {:?} lsns {}",
block_numbers, rel, read_lsn
);
match client
.get_page(page_api::GetPageRequest {
request_id: req.request_id.into(),
request_class: page_api::GetPageClass::Normal,
read_lsn,
rel,
block_numbers: block_numbers.clone(),
})
.await
{
Ok(resp) => {
// Write the received page images directly to the shared memory location
// that the backend requested.
if resp.pages.len() != block_numbers.len() {
error!(
"received unexpected response with {} page images from pageserver for a request for {} pages",
resp.pages.len(),
block_numbers.len(),
);
return Err(-1);
}
info!(
"received getpage response for blocks {:?} in rel {:?} lsns {}",
block_numbers, rel, read_lsn
);
for (page, (blkno, _lsn, dest, _guard)) in resp.pages.into_iter().zip(cache_misses)
{
let src: &[u8] = page.image.as_ref();
let len = std::cmp::min(src.len(), dest.bytes_total());
unsafe {
std::ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), len);
};
// Also store it in the LFC while we have it
self.cache
.remember_page(
&rel,
blkno,
page.image,
read_lsn.not_modified_since_lsn.unwrap(),
false,
)
.await;
}
}
Err(err) => {
info!("tonic error: {err:?}");
return Err(-1);
}
}
Ok(())
}
/// Subroutine to handle a PrefetchV request, since it's a little more complicated than
/// others.
///
/// This is very similar to a GetPageV request, but the results are only stored in the cache.
async fn handle_prefetchv_request(&'static self, req: &CPrefetchVRequest) -> Result<(), i32> {
let client = self
.client
.as_ref()
.expect("cannot handle requests without client");
let rel = req.reltag();
// Check the cache first
let mut cache_misses = Vec::with_capacity(req.nblocks as usize);
for i in 0..req.nblocks {
let blkno = req.block_number + i as u32;
// note: this is deadlock-safe even though we hold multiple locks at the same time,
// because they're always acquired in the same order.
let in_progress_guard = self
.in_progress_table
.lock(RequestInProgressKey::Block(rel, blkno), req.request_id)
.await;
let not_modified_since = match self.cache.page_is_cached(&rel, blkno).await {
Ok(CacheResult::Found(_)) => {
trace!("found blk {} in rel {:?} in LFC", blkno, rel);
continue;
}
Ok(CacheResult::NotFound(lsn)) => lsn,
Err(_io_error) => return Err(-1), // FIXME errno?
};
cache_misses.push((blkno, not_modified_since, in_progress_guard));
}
if cache_misses.is_empty() {
return Ok(());
}
let not_modified_since = cache_misses
.iter()
.map(|(_blkno, lsn, _guard)| *lsn)
.max()
.unwrap();
let block_numbers: Vec<u32> = cache_misses
.iter()
.map(|(blkno, _lsn, _guard)| *blkno)
.collect();
// TODO: spawn separate tasks for these. Use the integrated cache to keep track of the
// in-flight requests
match client
.get_page(page_api::GetPageRequest {
request_id: req.request_id.into(),
request_class: page_api::GetPageClass::Prefetch,
read_lsn: self.request_lsns(not_modified_since),
rel,
block_numbers: block_numbers.clone(),
})
.await
{
Ok(resp) => {
trace!(
"prefetch completed, remembering blocks {:?} in rel {:?} in LFC",
block_numbers, rel
);
if resp.pages.len() != block_numbers.len() {
error!(
"received unexpected response with {} page images from pageserver for a request for {} pages",
resp.pages.len(),
block_numbers.len(),
);
return Err(-1);
}
for (page, (blkno, _lsn, _guard)) in resp.pages.into_iter().zip(cache_misses) {
self.cache
.remember_page(&rel, blkno, page.image, not_modified_since, false)
.await;
}
}
Err(err) => {
info!("tonic error: {err:?}");
return Err(-1);
}
}
Ok(())
}
}
impl<T> MetricGroup<T> for CommunicatorWorkerProcessStruct<'_>
impl<T> MetricGroup<T> for CommunicatorWorkerProcessStruct
where
T: Encoding,
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
use measured::metric::MetricFamilyEncoding;
use measured::metric::name::MetricName;
self.lfc_metrics.collect_group_into(enc)?;
self.request_counters
.collect_family_into(MetricName::from_str("request_counters"), enc)?;
self.request_nblocks_counters
.collect_family_into(MetricName::from_str("request_nblocks_counters"), enc)?;
// FIXME: allocator metrics
Ok(())
self.lfc_metrics.collect_group_into(enc)
}
}

View File

@@ -4,9 +4,9 @@
//! - launch the main loop,
//! - receive IO requests from backends and process them,
//! - write results back to backends.
mod callbacks;
mod control_socket;
mod in_progress_ios;
mod lfc_metrics;
mod logging;
mod main_loop;

View File

@@ -1,21 +1,14 @@
//! Functions called from the C code in the worker process
use std::collections::HashMap;
use std::ffi::{CStr, CString, c_char};
use std::path::PathBuf;
use crate::init::CommunicatorInitStruct;
use crate::worker_process::main_loop;
use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
use pageserver_client_grpc::ShardStripeSize;
/// Launch the communicator's tokio tasks, which do most of the work.
///
/// The caller has initialized the process as a regular PostgreSQL background worker
/// process. The shared memory segment used to communicate with the backends has been
/// allocated and initialized earlier, at postmaster startup, in
/// rcommunicator_shmem_init().
/// process.
///
/// Inputs:
/// `tenant_id` and `timeline_id` can be NULL, if we're been launched in "non-Neon" mode,
@@ -30,19 +23,11 @@ use pageserver_client_grpc::ShardStripeSize;
/// This is called only once in the process, so the returned struct, and error message in
/// case of failure, are simply leaked.
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_process_launch(
cis: Box<CommunicatorInitStruct>,
pub extern "C" fn communicator_worker_launch(
tenant_id: *const c_char,
timeline_id: *const c_char,
auth_token: *const c_char,
shard_map: *mut *mut c_char,
nshards: u32,
stripe_size: u32,
file_cache_path: *const c_char,
initial_file_cache_size: u64,
error_p: *mut *const c_char,
) -> Option<&'static CommunicatorWorkerProcessStruct<'static>> {
tracing::warn!("starting threads in rust code");
) -> Option<&'static CommunicatorWorkerProcessStruct> {
// Convert the arguments into more convenient Rust types
let tenant_id = if tenant_id.is_null() {
None
@@ -56,41 +41,9 @@ pub extern "C" fn communicator_worker_process_launch(
let cstr = unsafe { CStr::from_ptr(timeline_id) };
Some(cstr.to_str().expect("assume UTF-8"))
};
let auth_token = if auth_token.is_null() {
None
} else {
let cstr = unsafe { CStr::from_ptr(auth_token) };
Some(cstr.to_str().expect("assume UTF-8"))
};
let file_cache_path = {
if file_cache_path.is_null() {
None
} else {
let c_str = unsafe { CStr::from_ptr(file_cache_path) };
Some(PathBuf::from(c_str.to_str().unwrap()))
}
};
let shard_map = shard_map_to_hash(nshards, shard_map);
// FIXME: distinguish between unsharded, and sharded with 1 shard
// Also, we might go from unsharded to sharded while the system
// is running.
let stripe_size = if stripe_size > 0 && nshards > 1 {
Some(ShardStripeSize(stripe_size))
} else {
None
};
// The `init` function does all the work.
let result = main_loop::init(
*cis,
tenant_id,
timeline_id,
auth_token,
shard_map,
stripe_size,
initial_file_cache_size,
file_cache_path,
);
let result = main_loop::init(tenant_id, timeline_id);
// On failure, return the error message to the C caller in *error_p.
match result {
@@ -105,47 +58,3 @@ pub extern "C" fn communicator_worker_process_launch(
}
}
}
/// Convert the "shard map" from an array of C strings, indexed by shard no to a rust HashMap
fn shard_map_to_hash(
nshards: u32,
shard_map: *mut *mut c_char,
) -> HashMap<utils::shard::ShardIndex, String> {
use utils::shard::*;
assert!(nshards <= u8::MAX as u32);
let mut result: HashMap<ShardIndex, String> = HashMap::new();
let mut p = shard_map;
for i in 0..nshards {
let c_str = unsafe { CStr::from_ptr(*p) };
p = unsafe { p.add(1) };
let s = c_str.to_str().unwrap();
let k = if nshards > 1 {
ShardIndex::new(ShardNumber(i as u8), ShardCount(nshards as u8))
} else {
ShardIndex::unsharded()
};
result.insert(k, s.into());
}
result
}
/// Inform the rust code about a configuration change
#[unsafe(no_mangle)]
pub extern "C" fn communicator_worker_config_reload(
proc_handle: &'static CommunicatorWorkerProcessStruct<'static>,
file_cache_size: u64,
shard_map: *mut *mut c_char,
nshards: u32,
stripe_size: u32,
) {
proc_handle.cache.resize_file_cache(file_cache_size as u32);
let shard_map = shard_map_to_hash(nshards, shard_map);
let stripe_size = (nshards > 1).then_some(ShardStripeSize(stripe_size));
proc_handle.update_shard_map(shard_map, stripe_size);
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,66 +0,0 @@
/*-------------------------------------------------------------------------
*
* communicator_new.h
* new implementation
*
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#ifndef COMMUNICATOR_NEW_H
#define COMMUNICATOR_NEW_H
#include "storage/buf_internals.h"
#include "lfc_prewarm.h"
#include "neon.h"
#include "neon_pgversioncompat.h"
#include "pagestore_client.h"
/* initialization at postmaster startup */
extern void CommunicatorNewShmemRequest(void);
extern void CommunicatorNewShmemInit(void);
/* initialization at backend startup */
extern void communicator_new_init(void);
/* Read requests */
extern bool communicator_new_rel_exists(NRelFileInfo rinfo, ForkNumber forkNum);
extern BlockNumber communicator_new_rel_nblocks(NRelFileInfo rinfo, ForkNumber forknum);
extern int64 communicator_new_dbsize(Oid dbNode);
extern void communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber base_blockno,
void **buffers, BlockNumber nblocks);
extern void communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno,
BlockNumber nblocks);
extern bool communicator_new_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno);
extern int communicator_new_read_slru_segment(
SlruKind kind,
uint32_t segno,
neon_request_lsns *request_lsns,
const char *path
);
/* Write requests, to keep the caches up-to-date */
extern void communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn);
extern void communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
const void *buffer, XLogRecPtr lsn);
extern void communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum,
BlockNumber blockno, BlockNumber nblocks,
XLogRecPtr lsn);
extern void communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn);
extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn);
extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn);
extern void communicator_new_update_cached_rel_size(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn);
/* other functions */
extern int32 communicator_new_approximate_working_set_size_seconds(time_t duration, bool reset);
extern FileCacheState *communicator_new_get_lfc_state(size_t max_entries);
extern LfcStatsEntry *communicator_new_get_lfc_stats(void);
#endif /* COMMUNICATOR_NEW_H */

View File

@@ -18,9 +18,6 @@
#include <unistd.h>
#include "miscadmin.h"
#if PG_VERSION_NUM >= 150000
#include "access/xlogrecovery.h"
#endif
#include "postmaster/bgworker.h"
#include "postmaster/interrupt.h"
#include "postmaster/postmaster.h"
@@ -36,13 +33,10 @@
#include "file_cache.h"
#include "neon.h"
#include "neon_perf_counters.h"
#include "pagestore_client.h"
/* the rust bindings, generated by cbindgen */
#include "communicator/communicator_bindings.h"
struct CommunicatorInitStruct *cis;
static void pump_logging(struct LoggingReceiver *logging);
PGDLLEXPORT void communicator_new_bgworker_main(Datum main_arg);
@@ -76,9 +70,6 @@ pg_init_communicator_process(void)
void
communicator_new_bgworker_main(Datum main_arg)
{
char **connstrings;
ShardMap shard_map;
uint64 file_cache_size;
struct LoggingReceiver *logging;
const char *errmsg = NULL;
const struct CommunicatorWorkerProcessStruct *proc_handle;
@@ -103,20 +94,6 @@ communicator_new_bgworker_main(Datum main_arg)
BackgroundWorkerUnblockSignals();
/* lfc_size_limit is in MBs */
file_cache_size = lfc_size_limit * (1024 * 1024 / BLCKSZ);
if (file_cache_size < 100)
file_cache_size = 100;
if (!parse_shard_map(pageserver_grpc_urls, &shard_map))
{
/* shouldn't happen, as the GUC was verified already */
elog(FATAL, "could not parse neon.pageserver_grpcs_urls");
}
connstrings = palloc(shard_map.num_shards * sizeof(char *));
for (int i = 0; i < shard_map.num_shards; i++)
connstrings[i] = shard_map.connstring[i];
/*
* By default, INFO messages are not printed to the log. We want
* `tracing::info!` messages emitted from the communicator to be printed,
@@ -131,20 +108,11 @@ communicator_new_bgworker_main(Datum main_arg)
logging = communicator_worker_configure_logging();
Assert(cis != NULL);
proc_handle = communicator_worker_process_launch(
cis,
neon_tenant[0] == '\0' ? NULL : neon_tenant,
neon_timeline[0] == '\0' ? NULL : neon_timeline,
neon_auth_token,
connstrings,
shard_map.num_shards,
neon_stripe_size,
lfc_path,
file_cache_size,
&errmsg);
pfree(connstrings);
cis = NULL;
proc_handle = communicator_worker_launch(
neon_tenant[0] == '\0' ? NULL : neon_tenant,
neon_timeline[0] == '\0' ? NULL : neon_timeline,
&errmsg
);
if (proc_handle == NULL)
{
/*
@@ -205,28 +173,6 @@ communicator_new_bgworker_main(Datum main_arg)
{
ConfigReloadPending = false;
ProcessConfigFile(PGC_SIGHUP);
/* lfc_size_limit is in MBs */
file_cache_size = lfc_size_limit * (1024 * 1024 / BLCKSZ);
if (file_cache_size < 100)
file_cache_size = 100;
/* Reload pageserver URLs */
if (!parse_shard_map(pageserver_grpc_urls, &shard_map))
{
/* shouldn't happen, as the GUC was verified already */
elog(FATAL, "could not parse neon.pageserver_grpcs_urls");
}
connstrings = palloc(shard_map.num_shards * sizeof(char *));
for (int i = 0; i < shard_map.num_shards; i++)
connstrings[i] = shard_map.connstring[i];
communicator_worker_config_reload(proc_handle,
file_cache_size,
connstrings,
shard_map.num_shards,
neon_stripe_size);
pfree(connstrings);
}
duration = TimestampDifferenceMilliseconds(before, GetCurrentTimestamp());
@@ -325,36 +271,3 @@ callback_set_my_latch_unsafe(void)
{
SetLatch(MyLatch);
}
/*
* FIXME: The logic from neon_get_request_lsns() needs to go here, except for
* the last-written LSN cache stuff, which is managed by the rust code now.
*/
XLogRecPtr
callback_get_request_lsn_unsafe(void)
{
/*
* NB: be very careful with what you do here! This is called from tokio
* threads, so anything tha tries to take LWLocks is unsafe, for example.
*
* RecoveryInProgress() is OK
*/
if (RecoveryInProgress())
{
XLogRecPtr replay_lsn = GetXLogReplayRecPtr(NULL);
return replay_lsn;
}
else
{
XLogRecPtr flushlsn;
#if PG_VERSION_NUM >= 150000
flushlsn = GetFlushRecPtr(NULL);
#else
flushlsn = GetFlushRecPtr();
#endif
return flushlsn;
}
}

View File

@@ -12,9 +12,6 @@
#ifndef COMMUNICATOR_PROCESS_H
#define COMMUNICATOR_PROCESS_H
extern struct CommunicatorInitStruct *cis;
/* initialization early at postmaster startup */
extern void pg_init_communicator_process(void);
#endif /* COMMUNICATOR_PROCESS_H */

View File

@@ -136,6 +136,15 @@ typedef struct FileCacheEntry
#define N_COND_VARS 64
#define CV_WAIT_TIMEOUT 10
#define MAX_PREWARM_WORKERS 8
typedef struct PrewarmWorkerState
{
uint32 prewarmed_pages;
uint32 skipped_pages;
TimestampTz completed;
} PrewarmWorkerState;
typedef struct FileCacheControl
{
uint64 generation; /* generation is needed to handle correct hash
@@ -181,27 +190,47 @@ typedef struct FileCacheControl
* again.
*/
HyperLogLogState wss_estimation;
/* Prewarmer state */
PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
size_t n_prewarm_workers;
size_t n_prewarm_entries;
size_t total_prewarm_pages;
size_t prewarm_batch;
bool prewarm_active;
bool prewarm_canceled;
dsm_handle prewarm_lfc_state_handle;
} FileCacheControl;
#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc
#define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks])
#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8)
#define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8)
static HTAB *lfc_hash;
static int lfc_desc = -1;
static LWLockId lfc_lock;
int lfc_max_size;
int lfc_size_limit;
static int lfc_max_size;
static int lfc_size_limit;
static int lfc_prewarm_limit;
static int lfc_prewarm_batch;
static int lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
static int lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
char *lfc_path;
static char *lfc_path;
static uint64 lfc_generation;
static FileCacheControl *lfc_ctl;
static bool lfc_do_prewarm;
bool lfc_store_prefetch_result;
bool lfc_prewarm_update_ws_estimation;
bool lfc_do_prewarm;
bool lfc_prewarm_cancel;
bool AmPrewarmWorker;
#define LFC_ENABLED() (lfc_ctl->limit != 0)
PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
/*
* Close LFC file if opened.
* All backends should close their LFC files once LFC is disabled.
@@ -227,8 +256,6 @@ lfc_switch_off(void)
{
int fd;
Assert(!neon_use_communicator_worker);
if (LFC_ENABLED())
{
HASH_SEQ_STATUS status;
@@ -294,8 +321,6 @@ lfc_maybe_disabled(void)
static bool
lfc_ensure_opened(void)
{
Assert(!neon_use_communicator_worker);
if (lfc_generation != lfc_ctl->generation)
{
lfc_close_file();
@@ -321,9 +346,6 @@ LfcShmemInit(void)
bool found;
static HASHCTL info;
if (neon_use_communicator_worker)
return;
if (lfc_max_size <= 0)
return;
@@ -513,6 +535,7 @@ lfc_init(void)
if (!process_shared_preload_libraries_in_progress)
neon_log(ERROR, "Neon module should be loaded via shared_preload_libraries");
DefineCustomBoolVariable("neon.store_prefetch_result_in_lfc",
"Immediately store received prefetch result in LFC",
NULL,
@@ -584,13 +607,34 @@ lfc_init(void)
lfc_check_chunk_size,
lfc_change_chunk_size,
NULL);
DefineCustomIntVariable("neon.file_cache_prewarm_limit",
"Maximal number of prewarmed chunks",
NULL,
&lfc_prewarm_limit,
INT_MAX, /* no limit by default */
0,
INT_MAX,
PGC_SIGHUP,
0,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.file_cache_prewarm_batch",
"Number of pages retrivied by prewarm from page server",
NULL,
&lfc_prewarm_batch,
64,
1,
INT_MAX,
PGC_SIGHUP,
0,
NULL,
NULL,
NULL);
}
/*
* Dump a list of pages that are currently in the LFC
*
* This is used to get a snapshot that can be used to prewarm the LFC later.
*/
FileCacheState*
lfc_get_state(size_t max_entries)
{
@@ -608,7 +652,7 @@ lfc_get_state(size_t max_entries)
uint8* bitmap;
size_t n_pages = 0;
size_t n_entries = Min(max_entries, lfc_ctl->used - lfc_ctl->pinned);
size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries, lfc_blocks_per_chunk);
size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries);
fcs = (FileCacheState*)palloc0(state_size);
SET_VARSIZE(fcs, state_size);
fcs->magic = FILE_CACHE_STATE_MAGIC;
@@ -642,6 +686,267 @@ lfc_get_state(size_t max_entries)
return fcs;
}
/*
* Prewarm LFC cache to the specified state. It uses lfc_prefetch function to load prewarmed page without hoilding shared buffer lock
* and avoid race conditions with other backends.
*/
void
lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
{
size_t fcs_chunk_size_log;
size_t n_entries;
size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size);
size_t fcs_size;
dsm_segment *seg;
BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];
if (!lfc_ensure_opened())
return;
if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0)
{
elog(LOG, "LFC: prewarm is disabled");
return;
}
if (n_workers > MAX_PREWARM_WORKERS)
{
elog(ERROR, "LFC: Too much prewarm workers, maximum is %d", MAX_PREWARM_WORKERS);
}
if (fcs == NULL || fcs->n_chunks == 0)
{
elog(LOG, "LFC: nothing to prewarm");
return;
}
if (fcs->magic != FILE_CACHE_STATE_MAGIC)
{
elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic);
}
fcs_size = VARSIZE(fcs);
if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size)
{
elog(ERROR, "LFC: Invalid file cache state size: %u vs. %u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs));
}
fcs_chunk_size_log = fcs->chunk_size_log;
if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG)
{
elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log);
}
n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
Assert(n_entries != 0);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
/* Do not prewarm more entries than LFC limit */
if (lfc_ctl->limit <= lfc_ctl->size)
{
elog(LOG, "LFC: skip prewarm because LFC is already filled");
LWLockRelease(lfc_lock);
return;
}
if (lfc_ctl->prewarm_active)
{
LWLockRelease(lfc_lock);
elog(ERROR, "LFC: skip prewarm because another prewarm is still active");
}
lfc_ctl->n_prewarm_entries = n_entries;
lfc_ctl->n_prewarm_workers = n_workers;
lfc_ctl->prewarm_active = true;
lfc_ctl->prewarm_canceled = false;
lfc_ctl->prewarm_batch = prewarm_batch;
memset(lfc_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState));
LWLockRelease(lfc_lock);
/* Calculate total number of pages to be prewarmed */
lfc_ctl->total_prewarm_pages = fcs->n_pages;
seg = dsm_create(fcs_size, 0);
memcpy(dsm_segment_address(seg), fcs, fcs_size);
lfc_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg);
/* Spawn background workers */
for (uint32 i = 0; i < n_workers; i++)
{
BackgroundWorker worker = {0};
worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
worker.bgw_start_time = BgWorkerStart_ConsistentState;
worker.bgw_restart_time = BGW_NEVER_RESTART;
strcpy(worker.bgw_library_name, "neon");
strcpy(worker.bgw_function_name, "lfc_prewarm_main");
snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1);
strcpy(worker.bgw_type, "LFC prewarm worker");
worker.bgw_main_arg = Int32GetDatum(i);
/* must set notify PID to wait for shutdown */
worker.bgw_notify_pid = MyProcPid;
if (!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i]))
{
ereport(LOG,
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
errmsg("LFC: registering dynamic bgworker prewarm failed"),
errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes")));
n_workers = i;
lfc_ctl->prewarm_canceled = true;
break;
}
}
for (uint32 i = 0; i < n_workers; i++)
{
bool interrupted;
do
{
interrupted = false;
PG_TRY();
{
BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]);
if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED)
{
elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status);
}
}
PG_CATCH();
{
elog(LOG, "LFC: cancel prewarm");
lfc_ctl->prewarm_canceled = true;
interrupted = true;
}
PG_END_TRY();
} while (interrupted);
if (!lfc_ctl->prewarm_workers[i].completed)
{
/* Background worker doesn't set completion time: it means that it was abnormally terminated */
elog(LOG, "LFC: prewarm worker %d failed", i+1);
/* Set completion time to prevent get_prewarm_info from considering this worker as active */
lfc_ctl->prewarm_workers[i].completed = GetCurrentTimestamp();
}
}
dsm_detach(seg);
LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
lfc_ctl->prewarm_active = false;
LWLockRelease(lfc_lock);
}
void
lfc_prewarm_main(Datum main_arg)
{
size_t snd_idx = 0, rcv_idx = 0;
size_t n_sent = 0, n_received = 0;
size_t fcs_chunk_size_log;
size_t max_prefetch_pages;
size_t prewarm_batch;
size_t n_workers;
dsm_segment *seg;
FileCacheState* fcs;
uint8* bitmap;
BufferTag tag;
PrewarmWorkerState* ws;
uint32 worker_id = DatumGetInt32(main_arg);
AmPrewarmWorker = true;
pqsignal(SIGTERM, die);
BackgroundWorkerUnblockSignals();
seg = dsm_attach(lfc_ctl->prewarm_lfc_state_handle);
if (seg == NULL)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("could not map dynamic shared memory segment")));
fcs = (FileCacheState*) dsm_segment_address(seg);
prewarm_batch = lfc_ctl->prewarm_batch;
fcs_chunk_size_log = fcs->chunk_size_log;
n_workers = lfc_ctl->n_prewarm_workers;
max_prefetch_pages = lfc_ctl->n_prewarm_entries << fcs_chunk_size_log;
ws = &lfc_ctl->prewarm_workers[worker_id];
bitmap = FILE_CACHE_STATE_BITMAP(fcs);
/* enable prefetch in LFC */
lfc_store_prefetch_result = true;
lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existed entries if LFC cache is full */
elog(LOG, "LFC: worker %d start prewarming", worker_id);
while (!lfc_ctl->prewarm_canceled)
{
if (snd_idx < max_prefetch_pages)
{
if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id)
{
/* If there are multiple workers, split chunks between them */
snd_idx += 1 << fcs_chunk_size_log;
}
else
{
if (BITMAP_ISSET(bitmap, snd_idx))
{
tag = fcs->chunks[snd_idx >> fcs_chunk_size_log];
tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1);
if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum))
{
(void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
n_sent += 1;
}
else
{
ws->skipped_pages += 1;
BITMAP_CLR(bitmap, snd_idx);
}
}
snd_idx += 1;
}
}
if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages)
{
if (n_received == n_sent && snd_idx == max_prefetch_pages)
{
break;
}
if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id)
{
/* Skip chunks processed by other workers */
rcv_idx += 1 << fcs_chunk_size_log;
continue;
}
/* Locate next block to prefetch */
while (!BITMAP_ISSET(bitmap, rcv_idx))
{
rcv_idx += 1;
}
tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log];
tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1);
if (communicator_prefetch_receive(tag))
{
ws->prewarmed_pages += 1;
}
else
{
ws->skipped_pages += 1;
}
rcv_idx += 1;
n_received += 1;
}
}
/* No need to perform prefetch cleanup here because prewarm worker will be terminated and
* connection to PS dropped just after return from this function.
*/
Assert(n_sent == n_received || lfc_ctl->prewarm_canceled);
elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received);
lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
}
void
lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
{
@@ -649,8 +954,6 @@ lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
FileCacheEntry *entry;
uint32 hash;
Assert(!neon_use_communicator_worker);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return;
@@ -696,8 +999,6 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
bool found = false;
uint32 hash;
Assert(!neon_use_communicator_worker);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return false;
@@ -733,8 +1034,6 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
uint32 hash;
int i = 0;
Assert(!neon_use_communicator_worker);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return 0;
@@ -842,8 +1141,6 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
int blocks_read = 0;
int buf_offset = 0;
Assert(!neon_use_communicator_worker);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return -1;
@@ -1154,7 +1451,7 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
/* Can't add this chunk - we don't have the space for it */
hash_search_with_hash_value(lfc_hash, &entry->key, hash,
HASH_REMOVE, NULL);
lfc_prewarm_cancel = true; /* cancel prewarm if LFC limit is reached */
lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */
return false;
}
@@ -1209,8 +1506,6 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
int chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
Assert(!neon_use_communicator_worker);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return false;
@@ -1356,8 +1651,6 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
uint32 entry_offset;
int buf_offset = 0;
Assert(!neon_use_communicator_worker);
if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */
return;
@@ -1534,44 +1827,128 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
LWLockRelease(lfc_lock);
}
/*
* Return an array of LfcStatsEntrys, terminated by an entry with NULL name
*/
LfcStatsEntry *
get_lfc_stats(void)
typedef struct
{
LfcStatsEntry *entries;
int i = 0;
TupleDesc tupdesc;
} NeonGetStatsCtx;
#define NUM_ENTRIES 10
entries = palloc(sizeof(LfcStatsEntry) * (NUM_ENTRIES + 1));
#define NUM_NEON_GET_STATS_COLS 2
entries[i++] = (LfcStatsEntry) {"file_cache_chunk_size_pages", lfc_ctl == NULL,
lfc_ctl ? lfc_blocks_per_chunk : 0 };
entries[i++] = (LfcStatsEntry) {"file_cache_misses", lfc_ctl == NULL,
lfc_ctl ? lfc_ctl->misses : 0};
entries[i++] = (LfcStatsEntry) {"file_cache_hits", lfc_ctl == NULL,
lfc_ctl ? lfc_ctl->hits : 0 };
entries[i++] = (LfcStatsEntry) {"file_cache_used", lfc_ctl == NULL,
lfc_ctl ? lfc_ctl->used : 0 };
entries[i++] = (LfcStatsEntry) {"file_cache_writes", lfc_ctl == NULL,
lfc_ctl ? lfc_ctl->writes : 0 };
entries[i++] = (LfcStatsEntry) {"file_cache_size", lfc_ctl == NULL,
lfc_ctl ? lfc_ctl->size : 0 };
entries[i++] = (LfcStatsEntry) {"file_cache_used_pages", lfc_ctl == NULL,
lfc_ctl ? lfc_ctl->used_pages : 0 };
entries[i++] = (LfcStatsEntry) {"file_cache_evicted_pages", lfc_ctl == NULL,
lfc_ctl ? lfc_ctl->evicted_pages : 0 };
entries[i++] = (LfcStatsEntry) {"file_cache_limit", lfc_ctl == NULL,
lfc_ctl ? lfc_ctl->limit : 0 };
entries[i++] = (LfcStatsEntry) {"file_cache_chunks_pinned", lfc_ctl == NULL,
lfc_ctl ? lfc_ctl->pinned : 0 };
entries[i++] = (LfcStatsEntry) { NULL, false, 0 };
Assert(i <= NUM_ENTRIES);
PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
Datum
neon_get_lfc_stats(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
NeonGetStatsCtx *fctx;
MemoryContext oldcontext;
TupleDesc tupledesc;
Datum result;
HeapTuple tuple;
char const *key;
uint64 value = 0;
Datum values[NUM_NEON_GET_STATS_COLS];
bool nulls[NUM_NEON_GET_STATS_COLS];
return entries;
if (SRF_IS_FIRSTCALL())
{
funcctx = SRF_FIRSTCALL_INIT();
/* Switch context when allocating stuff to be used in later calls */
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
/* Create a user function context for cross-call persistence */
fctx = (NeonGetStatsCtx *) palloc(sizeof(NeonGetStatsCtx));
/* Construct a tuple descriptor for the result rows. */
tupledesc = CreateTemplateTupleDesc(NUM_NEON_GET_STATS_COLS);
TupleDescInitEntry(tupledesc, (AttrNumber) 1, "lfc_key",
TEXTOID, -1, 0);
TupleDescInitEntry(tupledesc, (AttrNumber) 2, "lfc_value",
INT8OID, -1, 0);
fctx->tupdesc = BlessTupleDesc(tupledesc);
funcctx->user_fctx = fctx;
/* Return to original context when allocating transient memory */
MemoryContextSwitchTo(oldcontext);
}
funcctx = SRF_PERCALL_SETUP();
/* Get the saved state */
fctx = (NeonGetStatsCtx *) funcctx->user_fctx;
switch (funcctx->call_cntr)
{
case 0:
key = "file_cache_misses";
if (lfc_ctl)
value = lfc_ctl->misses;
break;
case 1:
key = "file_cache_hits";
if (lfc_ctl)
value = lfc_ctl->hits;
break;
case 2:
key = "file_cache_used";
if (lfc_ctl)
value = lfc_ctl->used;
break;
case 3:
key = "file_cache_writes";
if (lfc_ctl)
value = lfc_ctl->writes;
break;
case 4:
key = "file_cache_size";
if (lfc_ctl)
value = lfc_ctl->size;
break;
case 5:
key = "file_cache_used_pages";
if (lfc_ctl)
value = lfc_ctl->used_pages;
break;
case 6:
key = "file_cache_evicted_pages";
if (lfc_ctl)
value = lfc_ctl->evicted_pages;
break;
case 7:
key = "file_cache_limit";
if (lfc_ctl)
value = lfc_ctl->limit;
break;
case 8:
key = "file_cache_chunk_size_pages";
value = lfc_blocks_per_chunk;
break;
case 9:
key = "file_cache_chunks_pinned";
if (lfc_ctl)
value = lfc_ctl->pinned;
break;
default:
SRF_RETURN_DONE(funcctx);
}
values[0] = PointerGetDatum(cstring_to_text(key));
nulls[0] = false;
if (lfc_ctl)
{
nulls[1] = false;
values[1] = Int64GetDatum(value);
}
else
nulls[1] = true;
tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
result = HeapTupleGetDatum(tuple);
SRF_RETURN_NEXT(funcctx, result);
}
/*
* Function returning data from the local file cache
* relation node/tablespace/database/blocknum and access_counter
@@ -1811,3 +2188,83 @@ callback_get_lfc_metrics_unsafe(void)
return result;
}
PG_FUNCTION_INFO_V1(get_local_cache_state);
Datum
get_local_cache_state(PG_FUNCTION_ARGS)
{
size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
FileCacheState* fcs = lfc_get_state(max_entries);
if (fcs != NULL)
PG_RETURN_BYTEA_P((bytea*)fcs);
else
PG_RETURN_NULL();
}
PG_FUNCTION_INFO_V1(prewarm_local_cache);
Datum
prewarm_local_cache(PG_FUNCTION_ARGS)
{
bytea* state = PG_GETARG_BYTEA_PP(0);
uint32 n_workers = PG_GETARG_INT32(1);
FileCacheState* fcs = (FileCacheState*)state;
lfc_prewarm(fcs, n_workers);
PG_RETURN_NULL();
}
PG_FUNCTION_INFO_V1(get_prewarm_info);
Datum
get_prewarm_info(PG_FUNCTION_ARGS)
{
Datum values[4];
bool nulls[4];
TupleDesc tupdesc;
uint32 prewarmed_pages = 0;
uint32 skipped_pages = 0;
uint32 active_workers = 0;
uint32 total_pages;
size_t n_workers;
if (lfc_size_limit == 0)
PG_RETURN_NULL();
LWLockAcquire(lfc_lock, LW_SHARED);
if (!lfc_ctl || lfc_ctl->n_prewarm_workers == 0)
{
LWLockRelease(lfc_lock);
PG_RETURN_NULL();
}
n_workers = lfc_ctl->n_prewarm_workers;
total_pages = lfc_ctl->total_prewarm_pages;
for (size_t i = 0; i < n_workers; i++)
{
PrewarmWorkerState* ws = &lfc_ctl->prewarm_workers[i];
prewarmed_pages += ws->prewarmed_pages;
skipped_pages += ws->skipped_pages;
active_workers += ws->completed != 0;
}
LWLockRelease(lfc_lock);
tupdesc = CreateTemplateTupleDesc(4);
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0);
tupdesc = BlessTupleDesc(tupdesc);
MemSet(nulls, 0, sizeof(nulls));
values[0] = Int32GetDatum(total_pages);
values[1] = Int32GetDatum(prewarmed_pages);
values[2] = Int32GetDatum(skipped_pages);
values[3] = Int32GetDatum(active_workers);
PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
}

View File

@@ -11,19 +11,21 @@
#ifndef FILE_CACHE_h
#define FILE_CACHE_h
#include "lfc_prewarm.h"
#include "neon.h"
#include "neon_pgversioncompat.h"
typedef struct FileCacheState
{
int32 vl_len_; /* varlena header (do not touch directly!) */
uint32 magic;
uint32 n_chunks;
uint32 n_pages;
uint16 chunk_size_log;
BufferTag chunks[FLEXIBLE_ARRAY_MEMBER];
/* followed by bitmap */
} FileCacheState;
/* GUCs */
extern bool lfc_store_prefetch_result;
extern int lfc_max_size;
extern int lfc_size_limit;
extern char *lfc_path;
extern bool lfc_do_prewarm;
extern bool lfc_prewarm_cancel;
/* functions for local file cache */
extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
@@ -42,10 +44,11 @@ extern int lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum,
extern void lfc_init(void);
extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
const void* buffer, XLogRecPtr lsn);
extern FileCacheState* lfc_get_state(size_t max_entries);
extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);
extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset);
extern LfcStatsEntry *get_lfc_stats(void);
static inline bool
lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

View File

@@ -1,653 +0,0 @@
/*-------------------------------------------------------------------------
*
* lfc_prewarm.c
* Functions related to LFC prewarming
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "bitmap.h"
#include "communicator.h"
#include "communicator_new.h"
#include "file_cache.h"
#include "lfc_prewarm.h"
#include "neon.h"
#include "pagestore_client.h"
#include "funcapi.h"
#include "miscadmin.h"
#include "postmaster/bgworker.h"
#include "storage/dsm.h"
#include "tcop/tcopprot.h"
#include "utils/timestamp.h"
#define MAX_PREWARM_WORKERS 8
typedef struct PrewarmWorkerState
{
uint32 prewarmed_pages;
uint32 skipped_pages;
TimestampTz completed;
} PrewarmWorkerState;
typedef struct PrewarmControl
{
/* -1 when not using workers, 0 when no prewarm has been performed */
size_t n_prewarm_workers;
size_t total_prewarm_pages;
bool prewarm_active;
bool prewarm_canceled;
/* These are used in the non-worker mode */
uint32 prewarmed_pages;
uint32 skipped_pages;
TimestampTz completed;
/* These are used with workers */
PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
dsm_handle prewarm_lfc_state_handle;
size_t prewarm_batch;
size_t n_prewarm_entries;
} PrewarmControl;
static PrewarmControl *prewarm_ctl;
static int lfc_prewarm_limit;
static int lfc_prewarm_batch;
static LWLockId prewarm_lock;
bool AmPrewarmWorker;
static void lfc_prewarm_with_workers(FileCacheState *fcs, uint32 n_workers);
static void lfc_prewarm_with_async_requests(FileCacheState *fcs);
PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
void
pg_init_prewarm(void)
{
DefineCustomIntVariable("neon.file_cache_prewarm_limit",
"Maximal number of prewarmed chunks",
NULL,
&lfc_prewarm_limit,
INT_MAX, /* no limit by default */
0,
INT_MAX,
PGC_SIGHUP,
0,
NULL,
NULL,
NULL);
DefineCustomIntVariable("neon.file_cache_prewarm_batch",
"Number of pages retrivied by prewarm from page server",
NULL,
&lfc_prewarm_batch,
64,
1,
INT_MAX,
PGC_SIGHUP,
0,
NULL,
NULL,
NULL);
}
static size_t
PrewarmShmemSize(void)
{
return sizeof(PrewarmControl);
}
void
PrewarmShmemRequest(void)
{
RequestAddinShmemSpace(PrewarmShmemSize());
RequestNamedLWLockTranche("prewarm_lock", 1);
}
void
PrewarmShmemInit(void)
{
bool found;
prewarm_ctl = (PrewarmControl *) ShmemInitStruct("Prewarmer shmem state",
PrewarmShmemSize(),
&found);
if (!found)
{
/* it's zeroed already */
prewarm_lock = (LWLockId) GetNamedLWLockTranche("prewarm_lock");
}
}
static void
validate_fcs(FileCacheState *fcs)
{
size_t fcs_size;
#if 0
size_t fcs_chunk_size_log;
#endif
if (fcs->magic != FILE_CACHE_STATE_MAGIC)
{
elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic);
}
fcs_size = VARSIZE(fcs);
if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size)
{
elog(ERROR, "LFC: Invalid file cache state size: %u vs. %u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs));
}
/* FIXME */
#if 0
fcs_chunk_size_log = fcs->chunk_size_log;
if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG)
{
elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log);
}
#endif
}
/*
* Prewarm LFC cache to the specified state. It uses lfc_prefetch function to
* load prewarmed page without hoilding shared buffer lock and avoid race
* conditions with other backends.
*/
void
lfc_prewarm_with_workers(FileCacheState *fcs, uint32 n_workers)
{
size_t n_entries;
size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size);
size_t fcs_size = VARSIZE(fcs);
dsm_segment *seg;
BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];
Assert(!neon_use_communicator_worker);
if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0)
{
elog(LOG, "LFC: prewarm is disabled");
return;
}
if (n_workers > MAX_PREWARM_WORKERS)
{
elog(ERROR, "LFC: too many prewarm workers, maximum is %d", MAX_PREWARM_WORKERS);
}
if (fcs == NULL || fcs->n_chunks == 0)
{
elog(LOG, "LFC: nothing to prewarm");
return;
}
n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
Assert(n_entries != 0);
LWLockAcquire(prewarm_lock, LW_EXCLUSIVE);
/* Do not prewarm more entries than LFC limit */
/* FIXME */
#if 0
if (prewarm_ctl->limit <= prewarm_ctl->size)
{
elog(LOG, "LFC: skip prewarm because LFC is already filled");
LWLockRelease(prewarm_lock);
return;
}
#endif
if (prewarm_ctl->prewarm_active)
{
LWLockRelease(prewarm_lock);
elog(ERROR, "LFC: skip prewarm because another prewarm is still active");
}
prewarm_ctl->n_prewarm_entries = n_entries;
prewarm_ctl->n_prewarm_workers = n_workers;
prewarm_ctl->prewarm_active = true;
prewarm_ctl->prewarm_canceled = false;
prewarm_ctl->prewarm_batch = prewarm_batch;
memset(prewarm_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState));
/* Calculate total number of pages to be prewarmed */
prewarm_ctl->total_prewarm_pages = fcs->n_pages;
LWLockRelease(prewarm_lock);
seg = dsm_create(fcs_size, 0);
memcpy(dsm_segment_address(seg), fcs, fcs_size);
prewarm_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg);
/* Spawn background workers */
for (uint32 i = 0; i < n_workers; i++)
{
BackgroundWorker worker = {0};
worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
worker.bgw_start_time = BgWorkerStart_ConsistentState;
worker.bgw_restart_time = BGW_NEVER_RESTART;
strcpy(worker.bgw_library_name, "neon");
strcpy(worker.bgw_function_name, "lfc_prewarm_main");
snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1);
strcpy(worker.bgw_type, "LFC prewarm worker");
worker.bgw_main_arg = Int32GetDatum(i);
/* must set notify PID to wait for shutdown */
worker.bgw_notify_pid = MyProcPid;
if (!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i]))
{
ereport(LOG,
(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
errmsg("LFC: registering dynamic bgworker prewarm failed"),
errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes")));
n_workers = i;
prewarm_ctl->prewarm_canceled = true;
break;
}
}
for (uint32 i = 0; i < n_workers; i++)
{
bool interrupted;
do
{
interrupted = false;
PG_TRY();
{
BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]);
if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED)
{
elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status);
}
}
PG_CATCH();
{
elog(LOG, "LFC: cancel prewarm");
prewarm_ctl->prewarm_canceled = true;
interrupted = true;
}
PG_END_TRY();
} while (interrupted);
if (!prewarm_ctl->prewarm_workers[i].completed)
{
/* Background worker doesn't set completion time: it means that it was abnormally terminated */
elog(LOG, "LFC: prewarm worker %d failed", i+1);
/* Set completion time to prevent get_prewarm_info from considering this worker as active */
prewarm_ctl->prewarm_workers[i].completed = GetCurrentTimestamp();
}
}
dsm_detach(seg);
LWLockAcquire(prewarm_lock, LW_EXCLUSIVE);
prewarm_ctl->prewarm_active = false;
LWLockRelease(prewarm_lock);
}
void
lfc_prewarm_main(Datum main_arg)
{
size_t snd_idx = 0, rcv_idx = 0;
size_t n_sent = 0, n_received = 0;
size_t fcs_chunk_size_log;
size_t max_prefetch_pages;
size_t prewarm_batch;
size_t n_workers;
dsm_segment *seg;
FileCacheState* fcs;
uint8* bitmap;
BufferTag tag;
PrewarmWorkerState* ws;
uint32 worker_id = DatumGetInt32(main_arg);
Assert(!neon_use_communicator_worker);
AmPrewarmWorker = true;
pqsignal(SIGTERM, die);
BackgroundWorkerUnblockSignals();
seg = dsm_attach(prewarm_ctl->prewarm_lfc_state_handle);
if (seg == NULL)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("could not map dynamic shared memory segment")));
fcs = (FileCacheState*) dsm_segment_address(seg);
prewarm_batch = prewarm_ctl->prewarm_batch;
fcs_chunk_size_log = fcs->chunk_size_log;
n_workers = prewarm_ctl->n_prewarm_workers;
max_prefetch_pages = prewarm_ctl->n_prewarm_entries << fcs_chunk_size_log;
ws = &prewarm_ctl->prewarm_workers[worker_id];
bitmap = FILE_CACHE_STATE_BITMAP(fcs);
/* enable prefetch in LFC */
lfc_store_prefetch_result = true;
lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existed entries if LFC cache is full */
elog(LOG, "LFC: worker %d start prewarming", worker_id);
while (!prewarm_ctl->prewarm_canceled)
{
if (snd_idx < max_prefetch_pages)
{
if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id)
{
/* If there are multiple workers, split chunks between them */
snd_idx += 1 << fcs_chunk_size_log;
}
else
{
if (BITMAP_ISSET(bitmap, snd_idx))
{
tag = fcs->chunks[snd_idx >> fcs_chunk_size_log];
tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1);
if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum))
{
(void) communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
n_sent += 1;
}
else
{
ws->skipped_pages += 1;
BITMAP_CLR(bitmap, snd_idx);
}
}
snd_idx += 1;
}
}
if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages)
{
if (n_received == n_sent && snd_idx == max_prefetch_pages)
{
break;
}
if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id)
{
/* Skip chunks processed by other workers */
rcv_idx += 1 << fcs_chunk_size_log;
continue;
}
/* Locate next block to prefetch */
while (!BITMAP_ISSET(bitmap, rcv_idx))
{
rcv_idx += 1;
}
tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log];
tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1);
if (communicator_prefetch_receive(tag))
{
ws->prewarmed_pages += 1;
}
else
{
ws->skipped_pages += 1;
}
rcv_idx += 1;
n_received += 1;
}
}
/* No need to perform prefetch cleanup here because prewarm worker will be terminated and
* connection to PS dropped just after return from this function.
*/
Assert(n_sent == n_received || prewarm_ctl->prewarm_canceled);
elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received);
prewarm_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
}
/*
* Prewarm LFC cache to the specified state. Uses the new communicator
*
* FIXME: Is there a race condition because we're not holding Postgres
* buffer manager locks?
*/
static void
lfc_prewarm_with_async_requests(FileCacheState *fcs)
{
size_t n_entries;
uint8 *bitmap;
uint64 bitno;
int blocks_per_chunk;
Assert(neon_use_communicator_worker);
if (lfc_prewarm_limit == 0)
{
elog(LOG, "LFC: prewarm is disabled");
return;
}
if (fcs == NULL || fcs->n_chunks == 0)
{
elog(LOG, "LFC: nothing to prewarm");
return;
}
n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
Assert(n_entries != 0);
LWLockAcquire(prewarm_lock, LW_EXCLUSIVE);
/* Do not prewarm more entries than LFC limit */
/* FIXME */
#if 0
if (prewarm_ctl->limit <= prewarm_ctl->size)
{
elog(LOG, "LFC: skip prewarm because LFC is already filled");
LWLockRelease(prewarm_lock);
return;
}
#endif
if (prewarm_ctl->prewarm_active)
{
LWLockRelease(prewarm_lock);
elog(ERROR, "LFC: skip prewarm because another prewarm is still active");
}
prewarm_ctl->n_prewarm_entries = n_entries;
prewarm_ctl->n_prewarm_workers = -1;
prewarm_ctl->prewarm_active = true;
prewarm_ctl->prewarm_canceled = false;
/* Calculate total number of pages to be prewarmed */
prewarm_ctl->total_prewarm_pages = fcs->n_pages;
LWLockRelease(prewarm_lock);
elog(LOG, "LFC: start prewarming");
lfc_do_prewarm = true;
lfc_prewarm_cancel = false;
bitmap = FILE_CACHE_STATE_BITMAP(fcs);
blocks_per_chunk = 1 << fcs->chunk_size_log;
bitno = 0;
for (uint32 chunkno = 0; chunkno < fcs->n_chunks; chunkno++)
{
BufferTag *chunk_tag = &fcs->chunks[chunkno];
BlockNumber request_startblkno = InvalidBlockNumber;
BlockNumber request_endblkno;
if (lfc_prewarm_cancel)
{
prewarm_ctl->prewarm_canceled = true;
break;
}
/* take next chunk */
for (int j = 0; j < blocks_per_chunk; j++)
{
BlockNumber blkno = chunk_tag->blockNum + j;
if (BITMAP_ISSET(bitmap, bitno))
{
if (request_startblkno != InvalidBlockNumber)
{
if (request_endblkno == blkno)
{
/* append this block to the request */
request_endblkno++;
}
else
{
/* flush this request, and start new one */
communicator_new_prefetch_register_bufferv(
BufTagGetNRelFileInfo(*chunk_tag),
chunk_tag->forkNum,
request_startblkno,
request_endblkno - request_startblkno
);
request_startblkno = blkno;
request_endblkno = blkno + 1;
}
}
else
{
/* flush this request, if any, and start new one */
if (request_startblkno != InvalidBlockNumber)
{
communicator_new_prefetch_register_bufferv(
BufTagGetNRelFileInfo(*chunk_tag),
chunk_tag->forkNum,
request_startblkno,
request_endblkno - request_startblkno
);
}
request_startblkno = blkno;
request_endblkno = blkno + 1;
}
prewarm_ctl->prewarmed_pages += 1;
}
bitno++;
}
/* flush this request */
communicator_new_prefetch_register_bufferv(
BufTagGetNRelFileInfo(*chunk_tag),
chunk_tag->forkNum,
request_startblkno,
request_endblkno - request_startblkno
);
request_startblkno = request_endblkno = InvalidBlockNumber;
}
elog(LOG, "LFC: complete prewarming: loaded %lu pages", (unsigned long) prewarm_ctl->prewarmed_pages);
prewarm_ctl->completed = GetCurrentTimestamp();
LWLockAcquire(prewarm_lock, LW_EXCLUSIVE);
prewarm_ctl->prewarm_active = false;
LWLockRelease(prewarm_lock);
}
PG_FUNCTION_INFO_V1(get_local_cache_state);
Datum
get_local_cache_state(PG_FUNCTION_ARGS)
{
size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
FileCacheState* fcs;
if (neon_use_communicator_worker)
fcs = communicator_new_get_lfc_state(max_entries);
else
fcs = lfc_get_state(max_entries);
if (fcs != NULL)
PG_RETURN_BYTEA_P((bytea*)fcs);
else
PG_RETURN_NULL();
}
PG_FUNCTION_INFO_V1(prewarm_local_cache);
Datum
prewarm_local_cache(PG_FUNCTION_ARGS)
{
bytea* state = PG_GETARG_BYTEA_PP(0);
uint32 n_workers = PG_GETARG_INT32(1);
FileCacheState* fcs;
fcs = (FileCacheState *)state;
validate_fcs(fcs);
if (neon_use_communicator_worker)
lfc_prewarm_with_async_requests(fcs);
else
lfc_prewarm_with_workers(fcs, n_workers);
PG_RETURN_NULL();
}
PG_FUNCTION_INFO_V1(get_prewarm_info);
Datum
get_prewarm_info(PG_FUNCTION_ARGS)
{
Datum values[4];
bool nulls[4];
TupleDesc tupdesc;
uint32 prewarmed_pages = 0;
uint32 skipped_pages = 0;
uint32 active_workers = 0;
uint32 total_pages;
if (lfc_size_limit == 0)
PG_RETURN_NULL();
LWLockAcquire(prewarm_lock, LW_SHARED);
if (!prewarm_ctl || prewarm_ctl->n_prewarm_workers == 0)
{
LWLockRelease(prewarm_lock);
PG_RETURN_NULL();
}
if (prewarm_ctl->n_prewarm_workers == -1)
{
total_pages = prewarm_ctl->total_prewarm_pages;
prewarmed_pages = prewarm_ctl->prewarmed_pages;
skipped_pages = prewarm_ctl->prewarmed_pages;
active_workers = 1;
}
else
{
size_t n_workers;
n_workers = prewarm_ctl->n_prewarm_workers;
total_pages = prewarm_ctl->total_prewarm_pages;
for (size_t i = 0; i < n_workers; i++)
{
PrewarmWorkerState *ws = &prewarm_ctl->prewarm_workers[i];
prewarmed_pages += ws->prewarmed_pages;
skipped_pages += ws->skipped_pages;
active_workers += ws->completed != 0;
}
}
LWLockRelease(prewarm_lock);
tupdesc = CreateTemplateTupleDesc(4);
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0);
TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0);
tupdesc = BlessTupleDesc(tupdesc);
MemSet(nulls, 0, sizeof(nulls));
values[0] = Int32GetDatum(total_pages);
values[1] = Int32GetDatum(prewarmed_pages);
values[2] = Int32GetDatum(skipped_pages);
values[3] = Int32GetDatum(active_workers);
PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
}

View File

@@ -1,39 +0,0 @@
/*-------------------------------------------------------------------------
*
* lfc_prewarm.h
* Local File Cache prewarmer
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*-------------------------------------------------------------------------
*/
#ifndef LFC_PREWARM_H
#define LFC_PREWARM_H
#include "storage/buf_internals.h"
typedef struct FileCacheState
{
int32 vl_len_; /* varlena header (do not touch directly!) */
uint32 magic;
uint32 n_chunks;
uint32 n_pages;
uint16 chunk_size_log;
BufferTag chunks[FLEXIBLE_ARRAY_MEMBER];
/* followed by bitmap */
} FileCacheState;
#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc
#define FILE_CACHE_STATE_BITMAP(fcs) ((uint8*)&(fcs)->chunks[(fcs)->n_chunks])
#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks, blocks_per_chunk) (sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * blocks_per_chunk)+7)/8)
#define FILE_CACHE_STATE_SIZE(fcs) (sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8)
extern void pg_init_prewarm(void);
extern void PrewarmShmemRequest(void);
extern void PrewarmShmemInit(void);
#endif /* LFC_PREWARM_H */

View File

@@ -69,8 +69,7 @@ char *neon_project_id;
char *neon_branch_id;
char *neon_endpoint_id;
int32 max_cluster_size;
char *pageserver_connstring;
char *pageserver_grpc_urls;
char *page_server_connstring;
char *neon_auth_token;
int readahead_buffer_size = 128;
@@ -80,13 +79,20 @@ int neon_protocol_version = 3;
static int neon_compute_mode = 0;
static int max_reconnect_attempts = 60;
int neon_stripe_size;
static int stripe_size;
static int max_sockets;
static int pageserver_response_log_timeout = 10000;
/* 2.5 minutes. A bit higher than highest default TCP retransmission timeout */
static int pageserver_response_disconnect_timeout = 150000;
typedef struct
{
char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
size_t num_shards;
size_t stripe_size;
} ShardMap;
/*
* PagestoreShmemState is kept in shared memory. It contains the connection
* strings for each shard.
@@ -124,7 +130,7 @@ static uint64 pagestore_local_counter = 0;
typedef enum PSConnectionState {
PS_Disconnected, /* no connection yet */
PS_Connecting_Startup, /* connection starting up */
PS_Connecting_PageStream, /* negotiating pagestream */
PS_Connecting_PageStream, /* negotiating pagestream */
PS_Connected, /* connected, pagestream established */
} PSConnectionState;
@@ -173,8 +179,6 @@ static bool pageserver_flush(shardno_t shard_no);
static void pageserver_disconnect(shardno_t shard_no);
static void pageserver_disconnect_shard(shardno_t shard_no);
static void AssignShardMap(const char *newval);
static bool
PagestoreShmemIsValid(void)
{
@@ -188,8 +192,8 @@ PagestoreShmemIsValid(void)
* not valid, returns false. The contents of *result are undefined in
* that case, and must not be relied on.
*/
bool
parse_shard_map(const char *connstr, ShardMap *result)
static bool
ParseShardMap(const char *connstr, ShardMap *result)
{
const char *p;
int nshards = 0;
@@ -234,31 +238,24 @@ parse_shard_map(const char *connstr, ShardMap *result)
if (result)
{
result->num_shards = nshards;
result->stripe_size = neon_stripe_size;
result->stripe_size = stripe_size;
}
return true;
}
/* GUC hooks for neon.pageserver_connstring */
static bool
CheckPageserverConnstring(char **newval, void **extra, GucSource source)
{
char *p = *newval;
return parse_shard_map(p, NULL);
return ParseShardMap(p, NULL);
}
static void
AssignPageserverConnstring(const char *newval, void *extra)
{
/*
* 'neon.pageserver_connstring' is ignored if the new communicator is used.
* In that case, the shard map is loaded from 'neon.pageserver_grpc_urls'
* instead, and that happens in the communicator process only.
*/
if (neon_use_communicator_worker)
return;
ShardMap shard_map;
/*
* Only postmaster updates the copy in shared memory.
@@ -266,29 +263,11 @@ AssignPageserverConnstring(const char *newval, void *extra)
if (!PagestoreShmemIsValid() || IsUnderPostmaster)
return;
AssignShardMap(newval);
}
/* GUC hooks for neon.pageserver_connstring */
static bool
CheckPageserverGrpcUrls(char **newval, void **extra, GucSource source)
{
char *p = *newval;
return parse_shard_map(p, NULL);
}
static void
AssignShardMap(const char *newval)
{
ShardMap shard_map;
if (!parse_shard_map(newval, &shard_map))
if (!ParseShardMap(newval, &shard_map))
{
/*
* shouldn't happen, because we already checked the value in
* CheckPageserverConnstring/CheckPageserverGrpcUrls
* CheckPageserverConnstring
*/
elog(ERROR, "could not parse shard map");
}
@@ -394,17 +373,17 @@ get_shard_number(BufferTag *tag)
#if PG_MAJORVERSION_NUM < 16
hash = murmurhash32(tag->rnode.relNode);
hash = hash_combine(hash, murmurhash32(tag->blockNum / neon_stripe_size));
hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
#else
hash = murmurhash32(tag->relNumber);
hash = hash_combine(hash, murmurhash32(tag->blockNum / neon_stripe_size));
hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
#endif
return hash % n_shards;
}
static inline void
CLEANUP_AND_DISCONNECT(PageServer *shard)
CLEANUP_AND_DISCONNECT(PageServer *shard)
{
if (shard->wes_read)
{
@@ -426,7 +405,7 @@ CLEANUP_AND_DISCONNECT(PageServer *shard)
* complete the connection (e.g. due to receiving an earlier cancellation
* during connection start).
* Returns true if successfully connected; false if the connection failed.
*
*
* Throws errors in unrecoverable situations, or when this backend's query
* is canceled.
*/
@@ -1329,7 +1308,7 @@ PagestoreShmemInit(void)
pg_atomic_init_u64(&pagestore_shared->begin_update_counter, 0);
pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
AssignPageserverConnstring(pageserver_connstring, NULL);
AssignPageserverConnstring(page_server_connstring, NULL);
}
}
@@ -1348,21 +1327,12 @@ pg_init_libpagestore(void)
DefineCustomStringVariable("neon.pageserver_connstring",
"connection string to the page server",
NULL,
&pageserver_connstring,
&page_server_connstring,
"",
PGC_SIGHUP,
0, /* no flags required */
CheckPageserverConnstring, AssignPageserverConnstring, NULL);
DefineCustomStringVariable("neon.pageserver_grpc_urls",
"list of gRPC URLs for the page servers",
NULL,
&pageserver_grpc_urls,
"",
PGC_SIGHUP,
0, /* no flags required */
CheckPageserverGrpcUrls, NULL, NULL);
DefineCustomStringVariable("neon.timeline_id",
"Neon timeline_id the server is running on",
NULL,
@@ -1409,7 +1379,7 @@ pg_init_libpagestore(void)
DefineCustomIntVariable("neon.stripe_size",
"sharding stripe size",
NULL,
&neon_stripe_size,
&stripe_size,
2048, 1, INT_MAX,
PGC_SIGHUP,
GUC_UNIT_BLOCKS,
@@ -1518,7 +1488,7 @@ pg_init_libpagestore(void)
if (neon_auth_token)
neon_log(LOG, "using storage auth token from NEON_AUTH_TOKEN environment variable");
if (pageserver_connstring[0] || pageserver_grpc_urls[0])
if (page_server_connstring && page_server_connstring[0])
{
neon_log(PageStoreTrace, "set neon_smgr hook");
smgr_hook = smgr_neon;

View File

@@ -21,7 +21,6 @@
#include "replication/logicallauncher.h"
#include "replication/slot.h"
#include "replication/walsender.h"
#include "storage/ipc.h"
#include "storage/proc.h"
#include "storage/ipc.h"
#include "funcapi.h"
@@ -32,7 +31,6 @@
#include "utils/guc_tables.h"
#include "communicator.h"
#include "communicator_new.h"
#include "communicator_process.h"
#include "extension_server.h"
#include "file_cache.h"
@@ -457,16 +455,6 @@ _PG_init(void)
load_file("$libdir/neon_rmgr", false);
#endif
DefineCustomBoolVariable(
"neon.use_communicator_worker",
"Uses the communicator worker implementation",
NULL,
&neon_use_communicator_worker,
true,
PGC_POSTMASTER,
0,
NULL, NULL, NULL);
/*
* Initializing a pre-loaded Postgres extension happens in three stages:
*
@@ -501,14 +489,12 @@ _PG_init(void)
pg_init_libpagestore();
relsize_hash_init();
lfc_init();
pg_init_prewarm();
pg_init_walproposer();
init_lwlsncache();
pg_init_communicator_process();
pg_init_communicator();
Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
InitUnstableExtensionsSupport();
@@ -687,10 +673,7 @@ approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
duration = PG_ARGISNULL(0) ? (time_t) -1 : PG_GETARG_INT32(0);
if (neon_use_communicator_worker)
dc = communicator_new_approximate_working_set_size_seconds(duration, false);
else
dc = lfc_approximate_working_set_size_seconds(duration, false);
dc = lfc_approximate_working_set_size_seconds(duration, false);
if (dc < 0)
PG_RETURN_NULL();
else
@@ -703,49 +686,13 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
bool reset = PG_GETARG_BOOL(0);
int32 dc;
if (neon_use_communicator_worker)
dc = communicator_new_approximate_working_set_size_seconds(-1, reset);
else
dc = lfc_approximate_working_set_size_seconds(-1, reset);
dc = lfc_approximate_working_set_size_seconds(-1, reset);
if (dc < 0)
PG_RETURN_NULL();
else
PG_RETURN_INT32(dc);
}
PG_FUNCTION_INFO_V1(neon_get_lfc_stats);
Datum
neon_get_lfc_stats(PG_FUNCTION_ARGS)
{
#define NUM_NEON_GET_STATS_COLS 2
ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
LfcStatsEntry *entries;
LfcStatsEntry *entry;
InitMaterializedSRF(fcinfo, 0);
if (neon_use_communicator_worker)
entries = communicator_new_get_lfc_stats();
else
entries = get_lfc_stats();
entry = entries;
while (entry->metric_name != NULL)
{
Datum values[NUM_NEON_GET_STATS_COLS];
bool nulls[NUM_NEON_GET_STATS_COLS];
values[0] = CStringGetTextDatum(entry->metric_name);
nulls[1] = entry->isnull;
values[1] = Int64GetDatum(entry->isnull ? 0 : entry->value);
tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
entry++;
}
PG_RETURN_VOID();
}
/*
* Initialization stage 2: make requests for the amount of shared memory we
* will need.
@@ -761,13 +708,11 @@ neon_shmem_request_hook(void)
#endif
LfcShmemRequest();
PrewarmShmemRequest();
NeonPerfCountersShmemRequest();
PagestoreShmemRequest();
RelsizeCacheShmemRequest();
WalproposerShmemRequest();
LwLsnCacheShmemRequest();
CommunicatorNewShmemRequest();
}
@@ -786,13 +731,11 @@ neon_shmem_startup_hook(void)
LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
LfcShmemInit();
PrewarmShmemInit();
NeonPerfCountersShmemInit();
PagestoreShmemInit();
RelsizeCacheShmemInit();
WalproposerShmemInit();
LwLsnCacheShmemInit();
CommunicatorNewShmemInit();
#if PG_MAJORVERSION_NUM >= 17
WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance");

View File

@@ -84,10 +84,5 @@ extern void WalproposerShmemInit(void);
extern void LwLsnCacheShmemInit(void);
extern void NeonPerfCountersShmemInit(void);
typedef struct LfcStatsEntry {
const char *metric_name;
bool isnull;
uint64 value;
} LfcStatsEntry;
#endif /* NEON_H */

View File

@@ -9,10 +9,6 @@
#include "fmgr.h"
#include "storage/buf_internals.h"
#if PG_MAJORVERSION_NUM < 16
typedef PGAlignedBlock PGIOAlignedBlock;
#endif
#if PG_MAJORVERSION_NUM < 17
#define NRelFileInfoBackendIsTemp(rinfo) (rinfo.backend != InvalidBackendId)
#else
@@ -76,21 +72,22 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
(tag).rnode = (rinfo); \
} while (false)
#define BufTagGetNRelFileInfo(tag) (tag).rnode
#define BufTagGetNRelFileInfo(tag) tag.rnode
#define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode)
#define BufTagInit(tag, rel_number, fork_number, block_number, spc_oid, db_oid) \
#define BufTagInit(tag, relNumber, forknum, blkno, spcOid, dbOid) \
do { \
RelFileNode rnode = { .spcNode = (spc_oid), .dbNode = (db_oid), .relNode = (rel_number)}; \
(tag).forkNum = (fork_number); \
(tag).blockNum = (block_number); \
(tag).rnode = rnode; \
RelFileNode rnode = { .spcNode = spcOid, .dbNode = dbOid, .relNode = relNumber}; \
(tag).forkNum = forknum; \
(tag).blockNum = blkno; \
(tag).rnode = rnode; \
} while (false)
#define InvalidRelFileNumber InvalidOid
#define SMgrRelGetRelInfo(reln) ((reln)->smgr_rnode.node)
#define SMgrRelGetRelInfo(reln) \
(reln->smgr_rnode.node)
#define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers
@@ -136,16 +133,17 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
.relNumber = (tag).relNumber, \
})
#define BufTagInit(tag, rel_number, fork_number, block_number, spc_oid, db_oid) \
#define BufTagInit(tag, relNumber, forknum, blkno, spcOid, dbOid) \
do { \
(tag).forkNum = (fork_number); \
(tag).blockNum = (block_number); \
(tag).spcOid = (spc_oid); \
(tag).dbOid = (db_oid); \
(tag).relNumber = (rel_number); \
(tag).forkNum = forknum; \
(tag).blockNum = blkno; \
(tag).spcOid = spcOid; \
(tag).dbOid = dbOid; \
(tag).relNumber = relNumber; \
} while (false)
#define SMgrRelGetRelInfo(reln) ((reln)->smgr_rlocator)
#define SMgrRelGetRelInfo(reln) \
((reln)->smgr_rlocator)
#define DropRelationAllLocalBuffers DropRelationAllLocalBuffers
#endif
@@ -162,10 +160,6 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
#define AmAutoVacuumWorkerProcess() (IsAutoVacuumWorkerProcess())
#endif
#if PG_MAJORVERSION_NUM < 17
#define MyProcNumber (MyProc - &ProcGlobal->allProcs[0])
#endif
#if PG_MAJORVERSION_NUM < 15
extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
extern TimeLineID GetWALInsertionTimeLine(void);

View File

@@ -236,24 +236,14 @@ extern void prefetch_on_ps_disconnect(void);
extern page_server_api *page_server;
extern char *pageserver_connstring;
extern char *pageserver_grpc_urls;
extern char *page_server_connstring;
extern int flush_every_n_requests;
extern int readahead_buffer_size;
extern char *neon_timeline;
extern char *neon_tenant;
extern int32 max_cluster_size;
extern int neon_protocol_version;
extern int neon_stripe_size;
typedef struct
{
char connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
size_t num_shards;
size_t stripe_size;
} ShardMap;
extern bool parse_shard_map(const char *connstr, ShardMap *result);
extern shardno_t get_shard_number(BufferTag* tag);
extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);
@@ -300,7 +290,6 @@ extern int64 neon_dbsize(Oid dbNode);
extern void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
BlockNumber blkno, neon_request_lsns *output,
BlockNumber nblocks);
extern XLogRecPtr neon_get_write_lsn(void);
/* utils for neon relsize cache */
extern void relsize_hash_init(void);

View File

@@ -62,7 +62,6 @@
#include "bitmap.h"
#include "communicator.h"
#include "communicator_new.h"
#include "file_cache.h"
#include "neon.h"
#include "neon_lwlsncache.h"
@@ -88,7 +87,7 @@ static char *hexdump_page(char *page);
NInfoGetRelNumber(InfoFromSMgrRel(reln)) >= FirstNormalObjectId \
)
const int SmgrTrace = DEBUG1;
const int SmgrTrace = DEBUG5;
/* unlogged relation build states */
typedef enum
@@ -502,60 +501,6 @@ nm_adjust_lsn(XLogRecPtr lsn)
return lsn;
}
/*
* Get a LSN to use to stamp an operation like relation create or truncate.
* On operations on individual pages we use the LSN of the page, but when
* e.g. smgrcreate() is called, we have to do something else.
*/
XLogRecPtr
neon_get_write_lsn(void)
{
XLogRecPtr lsn;
if (RecoveryInProgress())
{
/*
* FIXME: v14 doesn't have GetCurrentReplayRecPtr(). Options:
* - add it in our fork
* - store a magic value that means that you must use
* current latest possible LSN at the time that the request
* on this thing is made again (or some other recent enough
* lsn).
*/
#if PG_VERSION_NUM >= 150000
lsn = GetCurrentReplayRecPtr(NULL);
#else
lsn = GetXLogReplayRecPtr(NULL); /* FIXME: this is wrong, see above */
#endif
}
else
lsn = GetXLogInsertRecPtr();
/*
* If the insert LSN points to just after page header, round it down to
* the beginning of the page, because the page header might not have been
* inserted to the WAL yet, and if we tried to flush it, the WAL flushing
* code gets upset.
*/
{
int segoff;
segoff = XLogSegmentOffset(lsn, wal_segment_size);
if (segoff == SizeOfXLogLongPHD)
{
lsn = lsn - segoff;
}
else
{
int offset = lsn % XLOG_BLCKSZ;
if (offset == SizeOfXLogShortPHD)
lsn = lsn - offset;
}
}
return lsn;
}
/*
* Return LSN for requesting pages and number of blocks from page server
@@ -799,6 +744,11 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
{
return true;
}
/*
* \d+ on a view calls smgrexists with 0/0/0 relfilenode. The page server
* will error out if you check that, because the whole dbdir for
@@ -822,20 +772,10 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
return false;
}
if (neon_use_communicator_worker)
return communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum);
else
{
if (get_cached_relsize(InfoFromSMgrRel(reln), forkNum, &n_blocks))
{
return true;
}
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
}
return communicator_exists(InfoFromSMgrRel(reln), forkNum, &request_lsns);
}
/*
@@ -893,36 +833,16 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
* relation. Currently, we don't call SetLastWrittenLSN() when a new
* relation created, so if we didn't remember the size in the relsize
* cache, we might call smgrnblocks() on the newly-created relation before
* the creation WAL record has been received by the page server.
*
* XXX: with the new communicator, similar considerations apply. However,
* during replay, neon_get_write_lsn() returns the (end-)LSN of the record
* that's being replayed, so we should not have the correctness issue
* mentioned in previous paragraph.
* the creation WAL record hass been received by the page server.
*/
if (neon_use_communicator_worker)
if (isRedo)
{
XLogRecPtr lsn = neon_get_write_lsn();
if (isRedo)
{
if (!communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum))
communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum, lsn);
}
else
communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum, lsn);
update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
&reln->smgr_cached_nblocks[forkNum]);
}
else
{
if (isRedo)
{
update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
get_cached_relsize(InfoFromSMgrRel(reln), forkNum,
&reln->smgr_cached_nblocks[forkNum]);
}
else
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
}
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);
if (debug_compare_local)
{
@@ -958,17 +878,9 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
* unlink, it won't do any harm if the file doesn't exist.
*/
mdunlink(rinfo, forkNum, isRedo);
if (!NRelFileInfoBackendIsTemp(rinfo))
{
if (neon_use_communicator_worker)
{
XLogRecPtr lsn = neon_get_write_lsn();
communicator_new_rel_unlink(InfoFromNInfoB(rinfo), forkNum, lsn);
}
else
forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
}
}
@@ -1048,6 +960,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
neon_wallog_page(reln, forkNum, n_blocks++, buffer, true);
neon_wallog_page(reln, forkNum, blkno, buffer, false);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
lsn = PageGetLSN((Page) buffer);
neon_log(SmgrTrace, "smgrextend called for %u/%u/%u.%u blk %u, page LSN: %X/%08X",
@@ -1055,51 +968,35 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
forkNum, blkno,
(uint32) (lsn >> 32), (uint32) lsn);
if (neon_use_communicator_worker)
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
if (debug_compare_local)
{
// FIXME: this can pass lsn == invalid. Is that ok?
communicator_new_rel_extend(InfoFromSMgrRel(reln), forkNum, blkno, (const void *) buffer, lsn);
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
}
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
}
else
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
* later, after it has been initialized with the real page contents, and
* it is eventually evicted from the buffer cache. But we need a valid LSN
* to the relation metadata update now.
*/
if (lsn == InvalidXLogRecPtr)
{
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blkno + 1);
lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer);
if (debug_compare_local)
{
if (IS_LOCAL_REL(reln))
mdextend(reln, forkNum, blkno, buffer, skipFsync);
}
/*
* smgr_extend is often called with an all-zeroes page, so
* lsn==InvalidXLogRecPtr. An smgr_write() call will come for the buffer
* later, after it has been initialized with the real page contents, and
* it is eventually evicted from the buffer cache. But we need a valid LSN
* to the relation metadata update now.
*/
if (lsn == InvalidXLogRecPtr)
{
lsn = GetXLogInsertRecPtr();
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
}
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
lsn = GetXLogInsertRecPtr();
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum, blkno);
}
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
}
#if PG_MAJORVERSION_NUM >= 16
static void
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
int nblocks, bool skipFsync)
{
const PGIOAlignedBlock buffer = {0};
BlockNumber blocknum = start_block;
int remblocks = nblocks;
XLogRecPtr lsn = 0;
@@ -1182,14 +1079,11 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);
if (!neon_use_communicator_worker)
for (int i = 0; i < count; i++)
{
for (int i = 0; i < count; i++)
{
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum,
blocknum + i);
}
lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data);
neon_set_lwlsn_block(lsn, InfoFromSMgrRel(reln), forkNum,
blocknum + i);
}
blocknum += count;
@@ -1198,15 +1092,8 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
Assert(lsn != 0);
if (neon_use_communicator_worker)
{
communicator_new_rel_zeroextend(InfoFromSMgrRel(reln), forkNum, start_block, nblocks, lsn);
}
else
{
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
}
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forkNum);
set_cached_relsize(InfoFromSMgrRel(reln), forkNum, blocknum);
}
#endif
@@ -1266,12 +1153,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_use_communicator_worker)
{
communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, nblocks);
return false;
}
tag.spcOid = reln->smgr_rlocator.locator.spcOid;
tag.dbOid = reln->smgr_rlocator.locator.dbOid;
tag.relNumber = reln->smgr_rlocator.locator.relNumber;
@@ -1298,8 +1179,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
blocknum += iterblocks;
}
if (!neon_use_communicator_worker)
communicator_prefetch_pump_state();
communicator_prefetch_pump_state();
return false;
}
@@ -1312,6 +1192,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
static bool
neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
{
BufferTag tag;
switch (reln->smgr_relpersistence)
{
case 0: /* probably shouldn't happen, but ignore it */
@@ -1326,25 +1208,17 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_use_communicator_worker)
{
communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, 1);
}
else
{
BufferTag tag;
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
return false;
if (lfc_cache_contains(InfoFromSMgrRel(reln), forknum, blocknum))
return false;
tag.forkNum = forknum;
tag.blockNum = blocknum;
tag.forkNum = forknum;
tag.blockNum = blocknum;
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
CopyNRelFileInfoToBufTag(tag, InfoFromSMgrRel(reln));
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
communicator_prefetch_pump_state();
}
communicator_prefetch_pump_state();
return false;
}
@@ -1388,8 +1262,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
*/
neon_log(SmgrTrace, "writeback noop");
if (!neon_use_communicator_worker)
communicator_prefetch_pump_state();
communicator_prefetch_pump_state();
if (debug_compare_local)
{
@@ -1406,14 +1279,7 @@ void
neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
neon_request_lsns request_lsns, void *buffer)
{
if (neon_use_communicator_worker)
{
// FIXME: request_lsns is ignored. That affects the neon_test_utils callers.
// Add the capability to specify the LSNs explicitly, for the sake of neon_test_utils ?
communicator_new_read_at_lsnv(rinfo, forkNum, blkno, &buffer, 1);
}
else
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL);
}
static void
@@ -1539,55 +1405,47 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_use_communicator_worker)
/* Try to read PS results if they are available */
communicator_prefetch_pump_state();
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
present = 0;
bufferp = buffer;
if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
{
communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forkNum, blkno,
(void *) &buffer, 1);
/* Prefetch hit */
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH)
{
return;
}
}
else
/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
{
/* Try to read PS results if they are available */
communicator_prefetch_pump_state();
neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1);
present = 0;
bufferp = buffer;
if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present))
MyNeonCounters->file_cache_hits_total++;
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
{
/* Prefetch hit */
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH)
{
return;
}
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
/* Try to read from local file cache */
if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer))
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC)
{
MyNeonCounters->file_cache_hits_total++;
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC)
{
return;
}
return;
}
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state();
}
neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state();
if (debug_compare_local)
{
compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn);
@@ -1650,67 +1508,59 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
nblocks, PG_IOV_MAX);
/* Try to read PS results if they are available */
if (!neon_use_communicator_worker)
communicator_prefetch_pump_state();
communicator_prefetch_pump_state();
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
request_lsns, nblocks);
memset(read_pages, 0, sizeof(read_pages));
if (neon_use_communicator_worker)
prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
blocknum, request_lsns, nblocks,
buffers, read_pages);
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
{
communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum,
buffers, nblocks);
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
else
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH && prefetch_result == nblocks)
{
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
request_lsns, nblocks);
prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
blocknum, request_lsns, nblocks,
buffers, read_pages);
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_PREFETCH)
{
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_PREFETCH && prefetch_result == nblocks)
{
return;
}
if (debug_compare_local > DEBUG_COMPARE_LOCAL_PREFETCH)
{
memset(read_pages, 0, sizeof(read_pages));
}
/* Try to read from local file cache */
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
nblocks, read_pages);
if (lfc_result > 0)
MyNeonCounters->file_cache_hits_total += lfc_result;
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
{
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC && prefetch_result + lfc_result == nblocks)
{
/* Read all blocks from LFC, so we're done */
return;
}
if (debug_compare_local > DEBUG_COMPARE_LOCAL_LFC)
{
memset(read_pages, 0, sizeof(read_pages));
}
communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
buffers, nblocks, read_pages);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state();
return;
}
if (debug_compare_local > DEBUG_COMPARE_LOCAL_PREFETCH)
{
memset(read_pages, 0, sizeof(read_pages));
}
/* Try to read from local file cache */
lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
nblocks, read_pages);
if (lfc_result > 0)
MyNeonCounters->file_cache_hits_total += lfc_result;
if (debug_compare_local >= DEBUG_COMPARE_LOCAL_LFC)
{
compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages);
}
if (debug_compare_local <= DEBUG_COMPARE_LOCAL_LFC && prefetch_result + lfc_result == nblocks)
{
/* Read all blocks from LFC, so we're done */
return;
}
if (debug_compare_local > DEBUG_COMPARE_LOCAL_LFC)
{
memset(read_pages, 0, sizeof(read_pages));
}
communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
buffers, nblocks, read_pages);
/*
* Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
*/
communicator_prefetch_pump_state();
if (debug_compare_local)
{
@@ -1811,16 +1661,9 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
forknum, blocknum,
(uint32) (lsn >> 32), (uint32) lsn);
if (neon_use_communicator_worker)
{
communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blocknum, buffer, lsn);
}
else
{
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer);
communicator_prefetch_pump_state();
}
communicator_prefetch_pump_state();
if (debug_compare_local)
{
@@ -1881,21 +1724,9 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
if (neon_use_communicator_worker)
{
for (int i = 0; i < nblocks; i++)
{
XLogRecPtr lsn = PageGetLSN((Page) buffers[i]);
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blkno + i, buffers[i], lsn);
}
}
else
{
lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks);
communicator_prefetch_pump_state();
}
communicator_prefetch_pump_state();
if (debug_compare_local)
{
@@ -1936,26 +1767,19 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_use_communicator_worker)
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
{
n_blocks = communicator_new_rel_nblocks(InfoFromSMgrRel(reln), forknum);
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
}
else
{
if (get_cached_relsize(InfoFromSMgrRel(reln), forknum, &n_blocks))
{
neon_log(SmgrTrace, "cached nblocks for %u/%u/%u.%u: %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
forknum, n_blocks);
return n_blocks;
}
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
neon_get_request_lsns(InfoFromSMgrRel(reln), forknum,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
}
n_blocks = communicator_nblocks(InfoFromSMgrRel(reln), forknum, &request_lsns);
update_cached_relsize(InfoFromSMgrRel(reln), forknum, n_blocks);
neon_log(SmgrTrace, "neon_nblocks: rel %u/%u/%u fork %u (request LSN %X/%08X): %u blocks",
RelFileInfoFmt(InfoFromSMgrRel(reln)),
@@ -1976,17 +1800,10 @@ neon_dbsize(Oid dbNode)
neon_request_lsns request_lsns;
NRelFileInfo dummy_node = {0};
if (neon_use_communicator_worker)
{
db_size = communicator_new_dbsize(dbNode);
}
else
{
neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
neon_get_request_lsns(dummy_node, MAIN_FORKNUM,
REL_METADATA_PSEUDO_BLOCKNO, &request_lsns, 1);
db_size = communicator_dbsize(dbNode, &request_lsns);
}
db_size = communicator_dbsize(dbNode, &request_lsns);
neon_log(SmgrTrace, "neon_dbsize: db %u (request LSN %X/%08X): %ld bytes",
dbNode, LSN_FORMAT_ARGS(request_lsns.effective_request_lsn), db_size);
@@ -2000,6 +1817,8 @@ neon_dbsize(Oid dbNode)
static void
neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, BlockNumber nblocks)
{
XLogRecPtr lsn;
switch (reln->smgr_relpersistence)
{
case 0:
@@ -2023,45 +1842,34 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
}
if (neon_use_communicator_worker)
{
XLogRecPtr lsn = neon_get_write_lsn();
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
communicator_new_rel_truncate(InfoFromSMgrRel(reln), forknum, nblocks, lsn);
}
else
{
XLogRecPtr lsn;
/*
* Truncating a relation drops all its buffers from the buffer cache
* without calling smgrwrite() on them. But we must account for that in
* our tracking of last-written-LSN all the same: any future smgrnblocks()
* request must return the new size after the truncation. We don't know
* what the LSN of the truncation record was, so be conservative and use
* the most recently inserted WAL record's LSN.
*/
lsn = GetXLogInsertRecPtr();
lsn = nm_adjust_lsn(lsn);
set_cached_relsize(InfoFromSMgrRel(reln), forknum, nblocks);
/*
* Flush it, too. We don't actually care about it here, but let's uphold
* the invariant that last-written LSN <= flush LSN.
*/
XLogFlush(lsn);
/*
* Truncating a relation drops all its buffers from the buffer cache
* without calling smgrwrite() on them. But we must account for that in
* our tracking of last-written-LSN all the same: any future smgrnblocks()
* request must return the new size after the truncation. We don't know
* what the LSN of the truncation record was, so be conservative and use
* the most recently inserted WAL record's LSN.
*/
lsn = GetXLogInsertRecPtr();
lsn = nm_adjust_lsn(lsn);
/*
* Flush it, too. We don't actually care about it here, but let's uphold
* the invariant that last-written LSN <= flush LSN.
*/
XLogFlush(lsn);
/*
* Truncate may affect several chunks of relations. So we should either
* update last written LSN for all of them, or update LSN for "dummy"
* metadata block. Second approach seems more efficient. If the relation
* is extended again later, the extension will update the last-written LSN
* for the extended pages, so there's no harm in leaving behind obsolete
* entries for the truncated chunks.
*/
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
}
/*
* Truncate may affect several chunks of relations. So we should either
* update last written LSN for all of them, or update LSN for "dummy"
* metadata block. Second approach seems more efficient. If the relation
* is extended again later, the extension will update the last-written LSN
* for the extended pages, so there's no harm in leaving behind obsolete
* entries for the truncated chunks.
*/
neon_set_lwlsn_relation(lsn, InfoFromSMgrRel(reln), forknum);
if (debug_compare_local)
{
@@ -2104,8 +1912,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
if (!neon_use_communicator_worker)
communicator_prefetch_pump_state();
communicator_prefetch_pump_state();
if (debug_compare_local)
{
@@ -2291,15 +2098,12 @@ neon_end_unlogged_build(SMgrRelation reln)
nblocks = mdnblocks(reln, MAIN_FORKNUM);
recptr = GetXLogInsertRecPtr();
if (!neon_use_communicator_worker)
{
neon_set_lwlsn_block_range(recptr,
InfoFromNInfoB(rinfob),
MAIN_FORKNUM, 0, nblocks);
neon_set_lwlsn_relation(recptr,
InfoFromNInfoB(rinfob),
MAIN_FORKNUM);
}
neon_set_lwlsn_block_range(recptr,
InfoFromNInfoB(rinfob),
MAIN_FORKNUM, 0, nblocks);
neon_set_lwlsn_relation(recptr,
InfoFromNInfoB(rinfob),
MAIN_FORKNUM);
/* Remove local copy */
for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
@@ -2308,15 +2112,8 @@ neon_end_unlogged_build(SMgrRelation reln)
RelFileInfoFmt(InfoFromNInfoB(rinfob)),
forknum);
if (neon_use_communicator_worker)
{
communicator_new_update_cached_rel_size(InfoFromSMgrRel(reln), forknum, nblocks, recptr);
}
else
{
forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
}
forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
mdclose(reln, forknum);
if (!debug_compare_local)
@@ -2384,10 +2181,7 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
request_lsns.not_modified_since = not_modified_since;
request_lsns.effective_request_lsn = request_lsn;
if (neon_use_communicator_worker)
n_blocks = communicator_new_read_slru_segment(kind, (uint32_t)segno, &request_lsns, path);
else
n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
return n_blocks;
}
@@ -2424,8 +2218,7 @@ AtEOXact_neon(XactEvent event, void *arg)
}
break;
}
if (!neon_use_communicator_worker)
communicator_reconfigure_timeout_if_needed();
communicator_reconfigure_timeout_if_needed();
}
static const struct f_smgr neon_smgr =
@@ -2483,10 +2276,7 @@ smgr_init_neon(void)
smgr_init_standard();
neon_init();
if (neon_use_communicator_worker)
communicator_new_init();
else
communicator_init();
communicator_init();
}
@@ -2498,20 +2288,6 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
/* This is only used in WAL replay */
Assert(RecoveryInProgress());
if (neon_use_communicator_worker)
{
relsize = communicator_new_rel_nblocks(rinfo, forknum);
if (blkno >= relsize)
communicator_new_rel_zeroextend(rinfo, forknum, relsize, (blkno - relsize) + 1, end_recptr);
/*
* FIXME: does this need to update the last-written LSN too, like the
* old implementation?
*/
return;
}
/* Extend the relation if we know its size */
if (get_cached_relsize(rinfo, forknum, &relsize))
{
@@ -2677,10 +2453,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
* We should perform this check after assigning LwLSN to prevent
* prefetching of some older version of the page by some other backend.
*/
if (neon_use_communicator_worker)
no_redo_needed = communicator_new_cache_contains(rinfo, forknum, blkno);
else
no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno);
no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno);
}
LWLockRelease(partitionLock);

View File

@@ -23,7 +23,9 @@
#include "utils/dynahash.h"
#include "utils/guc.h"
#if PG_VERSION_NUM >= 150000
#include "miscadmin.h"
#endif
typedef struct
{
@@ -88,8 +90,6 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
{
bool found = false;
Assert(!neon_use_communicator_worker);
if (relsize_hash_size > 0)
{
RelTag tag;
@@ -121,8 +121,6 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
void
set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
{
Assert(!neon_use_communicator_worker);
if (relsize_hash_size > 0)
{
RelTag tag;
@@ -171,8 +169,6 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
void
update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
{
Assert(!neon_use_communicator_worker);
if (relsize_hash_size > 0)
{
RelTag tag;
@@ -207,8 +203,6 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
void
forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
{
Assert(!neon_use_communicator_worker);
if (relsize_hash_size > 0)
{
RelTag tag;

View File

@@ -6,16 +6,13 @@ use std::time::Duration;
use anyhow::Context;
use compute_api::spec::PageserverProtocol;
use compute_api::spec::PageserverShardInfo;
use control_plane::endpoint::{
ComputeControlPlane, EndpointStatus, PageserverConnectionInfo, PageserverShardConnectionInfo,
};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
use control_plane::local_env::LocalEnv;
use futures::StreamExt;
use hyper::StatusCode;
use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT;
use pageserver_api::controller_api::AvailabilityZone;
use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, ShardStripeSize, TenantShardId};
use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
use postgres_connection::parse_host_port;
use safekeeper_api::membership::SafekeeperGeneration;
use serde::{Deserialize, Serialize};
@@ -509,65 +506,27 @@ impl ApiMethod for ComputeHookTenant {
if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring pageservers for endpoint {endpoint_name}");
let shard_count = match shards.len() {
0 => panic!("no shards"),
1 => ShardCount::unsharded(),
n => ShardCount(n.try_into().expect("too many shards")),
};
let mut shard_infos: HashMap<ShardIndex, PageserverShardInfo> = HashMap::new();
let prefer_protocol = if endpoint.grpc {
PageserverProtocol::Grpc
} else {
PageserverProtocol::Libpq
};
for shard in shards.iter() {
let ps_conf = env
.get_pageserver_conf(shard.node_id)
.expect("Unknown pageserver");
let libpq_url = Some({
let (host, port) = parse_host_port(&ps_conf.listen_pg_addr)
.expect("Unable to parse listen_pg_addr");
let port = port.unwrap_or(5432);
format!("postgres://no_user@{host}:{port}")
});
let grpc_url = if let Some(grpc_addr) = &ps_conf.listen_grpc_addr {
let (host, port) =
parse_host_port(grpc_addr).expect("invalid gRPC address");
let port = port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT);
Some(format!("grpc://no_user@{host}:{port}"))
} else {
None
};
let pageserver = PageserverShardConnectionInfo {
id: Some(shard.node_id.to_string()),
libpq_url,
grpc_url,
};
let shard_info = PageserverShardInfo {
pageservers: vec![pageserver],
};
shard_infos.insert(
ShardIndex {
shard_number: shard.shard_number,
shard_count,
},
shard_info,
);
}
let pageserver_conninfo = PageserverConnectionInfo {
shard_count,
stripe_size: stripe_size.map(|val| val.0),
shards: shard_infos,
prefer_protocol,
};
let pageservers = shards
.iter()
.map(|shard| {
let ps_conf = env
.get_pageserver_conf(shard.node_id)
.expect("Unknown pageserver");
if endpoint.grpc {
let addr = ps_conf.listen_grpc_addr.as_ref().expect("no gRPC address");
let (host, port) = parse_host_port(addr).expect("invalid gRPC address");
let port = port.unwrap_or(DEFAULT_GRPC_LISTEN_PORT);
(PageserverProtocol::Grpc, host, port)
} else {
let (host, port) = parse_host_port(&ps_conf.listen_pg_addr)
.expect("Unable to parse listen_pg_addr");
(PageserverProtocol::Libpq, host, port.unwrap_or(5432))
}
})
.collect::<Vec<_>>();
endpoint
.reconfigure_pageservers(&pageserver_conninfo)
.reconfigure_pageservers(pageservers, *stripe_size)
.await
.map_err(NotifyError::NeonLocal)?;
}

View File

@@ -4732,18 +4732,7 @@ class Endpoint(PgProtocol, LogUtils):
# set small 'max_replication_write_lag' to enable backpressure
# and make tests more stable.
config_lines += ["max_replication_write_lag=15MB"]
# If gRPC is enabled, use the new communicator too.
#
# NB: the communicator is enabled by default, so force it to false otherwise.
#
# XXX: By checking for None, we enable the new communicator for all tests
# by default
if grpc or grpc is None:
config_lines += ["neon.use_communicator_worker=on"]
else:
config_lines += ["neon.use_communicator_worker=off"]
config_lines = ["max_replication_write_lag=15MB"] + config_lines
# Delete file cache if it exists (and we're recreating the endpoint)
if USE_LFC:

View File

@@ -90,8 +90,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
# During shutdown, DownloadError::Cancelled may be logged as an error. Cleaning this
# up is tracked in https://github.com/neondatabase/neon/issues/6096
".*Cancelled, shutting down.*",
# gRPC request failures during shutdown.
".*grpc:pageservice.*request failed with Unavailable: timeline is shutting down.*",
# Open layers are only rolled at Lsn boundaries to avoid name clashses.
# Hence, we can overshoot the soft limit set by checkpoint distance.
# This is especially pronounced in tests that set small checkpoint
@@ -154,8 +152,6 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
".*reconciler.*neon_local error.*",
# Tenant rate limits may fire in tests that submit lots of API requests.
".*tenant \\S+ is rate limited.*",
# Reconciliations may get stuck/delayed e.g. in chaos tests.
".*background_reconcile: Shard reconciliation is stuck.*",
]

View File

@@ -16,7 +16,6 @@ def test_gin_redo(neon_simple_env: NeonEnv):
secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
con = primary.connect()
cur = con.cursor()
cur.execute("select pg_switch_wal()")
cur.execute("create table gin_test_tbl(id integer, i int4[])")
cur.execute("create index gin_test_idx on gin_test_tbl using gin (i)")
cur.execute("insert into gin_test_tbl select g,array[3, 1, g] from generate_series(1, 10000) g")

View File

@@ -17,9 +17,7 @@ def check_tenant(
config_lines = [
f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
]
endpoint = env.endpoints.create_start(
"main", tenant_id=tenant_id, config_lines=config_lines, grpc=True
)
endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines)
# we rely upon autocommit after each statement
res_1 = endpoint.safe_psql_many(
queries=[

View File

@@ -28,7 +28,6 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
clap = { version = "4", features = ["derive", "env", "string"] }
clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] }
const-oid = { version = "0.9", default-features = false, features = ["db", "std"] }
criterion = { version = "0.5", features = ["html_reports"] }
crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] }
der = { version = "0.7", default-features = false, features = ["derive", "flagset", "oid", "pem", "std"] }
deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] }
@@ -71,6 +70,7 @@ num-integer = { version = "0.1", features = ["i128"] }
num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] }
num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] }
num-traits = { version = "0.2", features = ["i128", "libm"] }
once_cell = { version = "1" }
p256 = { version = "0.13", features = ["jwk"] }
parquet = { version = "53", default-features = false, features = ["zstd"] }
prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] }
@@ -102,7 +102,7 @@ tokio-rustls = { version = "0.26", default-features = false, features = ["loggin
tokio-stream = { version = "0.1", features = ["net", "sync"] }
tokio-util = { version = "0.7", features = ["codec", "compat", "io-util", "rt"] }
toml_edit = { version = "0.22", features = ["serde"] }
tonic = { version = "0.13", default-features = false, features = ["codegen", "gzip", "prost", "router", "tls-native-roots", "tls-ring", "transport", "zstd"] }
tonic = { version = "0.13", default-features = false, features = ["codegen", "gzip", "prost", "router", "server", "tls-native-roots", "tls-ring", "zstd"] }
tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "log"] }
tracing = { version = "0.1", features = ["log"] }
tracing-core = { version = "0.1" }
@@ -140,6 +140,7 @@ num-integer = { version = "0.1", features = ["i128"] }
num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] }
num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] }
num-traits = { version = "0.2", features = ["i128", "libm"] }
once_cell = { version = "1" }
parquet = { version = "53", default-features = false, features = ["zstd"] }
prettyplease = { version = "0.2", default-features = false, features = ["verbatim"] }
proc-macro2 = { version = "1" }