Compare commits

..

1 Commits

Author SHA1 Message Date
Vlad Lazar
6a6a6fa659 repro 2025-02-20 18:56:34 +01:00
51 changed files with 249 additions and 970 deletions

19
Cargo.lock generated
View File

@@ -1874,12 +1874,6 @@ dependencies = [
"syn 2.0.90",
]
[[package]]
name = "difflib"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8"
[[package]]
name = "digest"
version = "0.10.7"
@@ -3337,17 +3331,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "json-structural-diff"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e878e36a8a44c158505c2c818abdc1350413ad83dcb774a0459f6a7ef2b65cbf"
dependencies = [
"difflib",
"regex",
"serde_json",
]
[[package]]
name = "jsonwebtoken"
version = "9.2.0"
@@ -6460,7 +6443,6 @@ dependencies = [
"humantime",
"hyper 0.14.30",
"itertools 0.10.5",
"json-structural-diff",
"lasso",
"measured",
"metrics",
@@ -7634,7 +7616,6 @@ dependencies = [
"once_cell",
"pin-project-lite",
"postgres_connection",
"pprof",
"pq_proto",
"rand 0.8.5",
"regex",

View File

@@ -210,7 +210,6 @@ rustls-native-certs = "0.8"
x509-parser = "0.16"
whoami = "1.5.1"
zerocopy = { version = "0.7", features = ["derive"] }
json-structural-diff = { version = "0.2.0" }
## TODO replace this with tracing
env_logger = "0.10"

View File

@@ -292,7 +292,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.85.0
ENV RUSTC_VERSION=1.84.1
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
ARG RUSTFILT_VERSION=0.2.1

View File

@@ -1669,11 +1669,7 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
# Disabled temporarily, because it clashed with pg_mooncake. pg_mooncake
# also depends on libduckdb, but a different version.
#COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/

View File

@@ -41,6 +41,7 @@ use std::process::exit;
use std::str::FromStr;
use std::sync::atomic::Ordering;
use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
use std::time::SystemTime;
use std::{thread, time::Duration};
use anyhow::{Context, Result};
@@ -85,6 +86,19 @@ fn parse_remote_ext_config(arg: &str) -> Result<String> {
}
}
/// Generate a compute ID if one is not supplied. This exists to keep forward
/// compatibility tests working, but will be removed in a future iteration.
fn generate_compute_id() -> String {
let now = SystemTime::now();
format!(
"compute-{}",
now.duration_since(SystemTime::UNIX_EPOCH)
.unwrap()
.as_secs()
)
}
#[derive(Parser)]
#[command(rename_all = "kebab-case")]
struct Cli {
@@ -98,13 +112,16 @@ struct Cli {
/// outside the compute will talk to the compute through this port. Keep
/// the previous name for this argument around for a smoother release
/// with the control plane.
#[arg(long, default_value_t = 3080)]
///
/// TODO: Remove the alias after the control plane release which teaches the
/// control plane about the renamed argument.
#[arg(long, alias = "http-port", default_value_t = 3080)]
pub external_http_port: u16,
/// The port to bind the internal listening HTTP server to. Clients include
/// The port to bind the internal listening HTTP server to. Clients like
/// the neon extension (for installing remote extensions) and local_proxy.
#[arg(long, default_value_t = 3081)]
pub internal_http_port: u16,
#[arg(long)]
pub internal_http_port: Option<u16>,
#[arg(short = 'D', long, value_name = "DATADIR")]
pub pgdata: String,
@@ -139,7 +156,7 @@ struct Cli {
#[arg(short = 'S', long, group = "spec-path")]
pub spec_path: Option<OsString>,
#[arg(short = 'i', long, group = "compute-id")]
#[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())]
pub compute_id: String,
#[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")]
@@ -342,7 +359,7 @@ fn wait_spec(
pgbin: cli.pgbin.clone(),
pgversion: get_pg_version_string(&cli.pgbin),
external_http_port: cli.external_http_port,
internal_http_port: cli.internal_http_port,
internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1),
live_config_allowed,
state: Mutex::new(new_state),
state_changed: Condvar::new(),
@@ -366,7 +383,7 @@ fn wait_spec(
// The internal HTTP server could be launched later, but there isn't much
// sense in waiting.
Server::Internal(cli.internal_http_port).launch(&compute);
Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute);
if !spec_set {
// No spec provided, hang waiting for it.

View File

@@ -2,7 +2,6 @@ DO $$
DECLARE
subname TEXT;
BEGIN
LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE;
FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP
EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname);
EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);

View File

@@ -46,8 +46,6 @@ use std::process::Command;
use std::str::FromStr;
use std::sync::Arc;
use std::time::Duration;
use std::time::SystemTime;
use std::time::UNIX_EPOCH;
use anyhow::{anyhow, bail, Context, Result};
use compute_api::requests::ConfigurationRequest;
@@ -61,7 +59,6 @@ use nix::sys::signal::Signal;
use pageserver_api::shard::ShardStripeSize;
use reqwest::header::CONTENT_TYPE;
use serde::{Deserialize, Serialize};
use tracing::debug;
use url::Host;
use utils::id::{NodeId, TenantId, TimelineId};
@@ -84,10 +81,8 @@ pub struct EndpointConf {
internal_http_port: u16,
pg_version: u32,
skip_pg_catalog_updates: bool,
reconfigure_concurrency: usize,
drop_subscriptions_before_start: bool,
features: Vec<ComputeFeature>,
cluster: Option<Cluster>,
}
//
@@ -184,9 +179,7 @@ impl ComputeControlPlane {
// we also skip catalog updates in the cloud.
skip_pg_catalog_updates,
drop_subscriptions_before_start,
reconfigure_concurrency: 1,
features: vec![],
cluster: None,
});
ep.create_endpoint_dir()?;
@@ -203,9 +196,7 @@ impl ComputeControlPlane {
pg_version,
skip_pg_catalog_updates,
drop_subscriptions_before_start,
reconfigure_concurrency: 1,
features: vec![],
cluster: None,
})?,
)?;
std::fs::write(
@@ -270,11 +261,8 @@ pub struct Endpoint {
skip_pg_catalog_updates: bool,
drop_subscriptions_before_start: bool,
reconfigure_concurrency: usize,
// Feature flags
features: Vec<ComputeFeature>,
// Cluster settings
cluster: Option<Cluster>,
}
#[derive(PartialEq, Eq)]
@@ -314,8 +302,6 @@ impl Endpoint {
let conf: EndpointConf =
serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;
debug!("serialized endpoint conf: {:?}", conf);
Ok(Endpoint {
pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port),
external_http_address: SocketAddr::new(
@@ -333,10 +319,8 @@ impl Endpoint {
tenant_id: conf.tenant_id,
pg_version: conf.pg_version,
skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
reconfigure_concurrency: conf.reconfigure_concurrency,
drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
features: conf.features,
cluster: conf.cluster,
})
}
@@ -623,7 +607,7 @@ impl Endpoint {
};
// Create spec file
let mut spec = ComputeSpec {
let spec = ComputeSpec {
skip_pg_catalog_updates: self.skip_pg_catalog_updates,
format_version: 1.0,
operation_uuid: None,
@@ -656,7 +640,7 @@ impl Endpoint {
Vec::new()
},
settings: None,
postgresql_conf: Some(postgresql_conf.clone()),
postgresql_conf: Some(postgresql_conf),
},
delta_operations: None,
tenant_id: Some(self.tenant_id),
@@ -669,35 +653,9 @@ impl Endpoint {
pgbouncer_settings: None,
shard_stripe_size: Some(shard_stripe_size),
local_proxy_config: None,
reconfigure_concurrency: self.reconfigure_concurrency,
reconfigure_concurrency: 1,
drop_subscriptions_before_start: self.drop_subscriptions_before_start,
};
// this strange code is needed to support respec() in tests
if self.cluster.is_some() {
debug!("Cluster is already set in the endpoint spec, using it");
spec.cluster = self.cluster.clone().unwrap();
debug!("spec.cluster {:?}", spec.cluster);
// fill missing fields again
if create_test_user {
spec.cluster.roles.push(Role {
name: PgIdent::from_str("test").unwrap(),
encrypted_password: None,
options: None,
});
spec.cluster.databases.push(Database {
name: PgIdent::from_str("neondb").unwrap(),
owner: PgIdent::from_str("test").unwrap(),
options: None,
restrict_conn: false,
invalid: false,
});
}
spec.cluster.postgresql_conf = Some(postgresql_conf);
}
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -715,14 +673,18 @@ impl Endpoint {
println!("Also at '{}'", conn_str);
}
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
//cmd.args([
// "--external-http-port",
// &self.external_http_address.port().to_string(),
//])
//.args([
// "--internal-http-port",
// &self.internal_http_address.port().to_string(),
//])
cmd.args([
"--external-http-port",
"--http-port",
&self.external_http_address.port().to_string(),
])
.args([
"--internal-http-port",
&self.internal_http_address.port().to_string(),
])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &conn_str])
.args([
@@ -739,16 +701,20 @@ impl Endpoint {
])
// TODO: It would be nice if we generated compute IDs with the same
// algorithm as the real control plane.
.args([
"--compute-id",
&format!(
"compute-{}",
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap()
.as_secs()
),
])
//
// TODO: Add this back when
// https://github.com/neondatabase/neon/pull/10747 is merged.
//
//.args([
// "--compute-id",
// &format!(
// "compute-{}",
// SystemTime::now()
// .duration_since(UNIX_EPOCH)
// .unwrap()
// .as_secs()
// ),
//])
.stdin(std::process::Stdio::null())
.stderr(logfile.try_clone()?)
.stdout(logfile);

View File

@@ -335,21 +335,13 @@ impl PageServerNode {
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'checkpoint_distance' as an integer")?,
checkpoint_timeout: settings
.remove("checkpoint_timeout")
.map(humantime::parse_duration)
.transpose()
.context("Failed to parse 'checkpoint_timeout' as duration")?,
checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()),
compaction_target_size: settings
.remove("compaction_target_size")
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'compaction_target_size' as an integer")?,
compaction_period: settings
.remove("compaction_period")
.map(humantime::parse_duration)
.transpose()
.context("Failed to parse 'compaction_period' as duration")?,
compaction_period: settings.remove("compaction_period").map(|x| x.to_string()),
compaction_threshold: settings
.remove("compaction_threshold")
.map(|x| x.parse::<usize>())
@@ -395,10 +387,7 @@ impl PageServerNode {
.map(|x| x.parse::<u64>())
.transpose()
.context("Failed to parse 'gc_horizon' as an integer")?,
gc_period: settings.remove("gc_period")
.map(humantime::parse_duration)
.transpose()
.context("Failed to parse 'gc_period' as duration")?,
gc_period: settings.remove("gc_period").map(|x| x.to_string()),
image_creation_threshold: settings
.remove("image_creation_threshold")
.map(|x| x.parse::<usize>())
@@ -414,20 +403,13 @@ impl PageServerNode {
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'image_creation_preempt_threshold' as integer")?,
pitr_interval: settings.remove("pitr_interval")
.map(humantime::parse_duration)
.transpose()
.context("Failed to parse 'pitr_interval' as duration")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")
.map(humantime::parse_duration)
.transpose()
.context("Failed to parse 'walreceiver_connect_timeout' as duration")?,
.map(|x| x.to_string()),
lagging_wal_timeout: settings
.remove("lagging_wal_timeout")
.map(humantime::parse_duration)
.transpose()
.context("Failed to parse 'lagging_wal_timeout' as duration")?,
.map(|x| x.to_string()),
max_lsn_wal_lag: settings
.remove("max_lsn_wal_lag")
.map(|x| x.parse::<NonZeroU64>())
@@ -445,14 +427,8 @@ impl PageServerNode {
.context("Failed to parse 'min_resident_size_override' as integer")?,
evictions_low_residence_duration_metric_threshold: settings
.remove("evictions_low_residence_duration_metric_threshold")
.map(humantime::parse_duration)
.transpose()
.context("Failed to parse 'evictions_low_residence_duration_metric_threshold' as duration")?,
heatmap_period: settings
.remove("heatmap_period")
.map(humantime::parse_duration)
.transpose()
.context("Failed to parse 'heatmap_period' as duration")?,
.map(|x| x.to_string()),
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
lazy_slru_download: settings
.remove("lazy_slru_download")
.map(|x| x.parse::<bool>())
@@ -463,15 +439,10 @@ impl PageServerNode {
.map(serde_json::from_str)
.transpose()
.context("parse `timeline_get_throttle` from json")?,
lsn_lease_length: settings.remove("lsn_lease_length")
.map(humantime::parse_duration)
.transpose()
.context("Failed to parse 'lsn_lease_length' as duration")?,
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
lsn_lease_length_for_ts: settings
.remove("lsn_lease_length_for_ts")
.map(humantime::parse_duration)
.transpose()
.context("Failed to parse 'lsn_lease_length_for_ts' as duration")?,
.map(|x| x.to_string()),
timeline_offloading: settings
.remove("timeline_offloading")
.map(|x| x.parse::<bool>())

View File

@@ -47,9 +47,6 @@ enum Command {
listen_http_addr: String,
#[arg(long)]
listen_http_port: u16,
#[arg(long)]
listen_https_port: Option<u16>,
#[arg(long)]
availability_zone_id: String,
},
@@ -397,7 +394,6 @@ async fn main() -> anyhow::Result<()> {
listen_pg_port,
listen_http_addr,
listen_http_port,
listen_https_port,
availability_zone_id,
} => {
storcon_client
@@ -410,7 +406,6 @@ async fn main() -> anyhow::Result<()> {
listen_pg_port,
listen_http_addr,
listen_http_port,
listen_https_port,
availability_zone_id: AvailabilityZone(availability_zone_id),
}),
)
@@ -959,7 +954,7 @@ async fn main() -> anyhow::Result<()> {
threshold: threshold.into(),
},
)),
heatmap_period: Some(Duration::from_secs(300)),
heatmap_period: Some("300s".to_string()),
..Default::default()
},
})

View File

@@ -77,5 +77,4 @@ echo "Start compute node"
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
-C "postgresql://cloud_admin@localhost:55433/postgres" \
-b /usr/local/bin/postgres \
--compute-id "compute-$RANDOM" \
-S ${SPEC_FILE}

View File

@@ -252,7 +252,7 @@ pub enum ComputeMode {
Replica,
}
#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub struct Cluster {
pub cluster_id: Option<String>,
pub name: Option<String>,
@@ -283,7 +283,7 @@ pub struct DeltaOp {
/// Rust representation of Postgres role info with only those fields
/// that matter for us.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Role {
pub name: PgIdent,
pub encrypted_password: Option<String>,
@@ -292,7 +292,7 @@ pub struct Role {
/// Rust representation of Postgres database info with only those fields
/// that matter for us.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Database {
pub name: PgIdent,
pub owner: PgIdent,
@@ -308,7 +308,7 @@ pub struct Database {
/// Common type representing both SQL statement params with or without value,
/// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
/// options like `wal_level = logical`.
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct GenericOption {
pub name: String,
pub value: Option<String>,

View File

@@ -122,8 +122,6 @@ pub struct ConfigToml {
pub page_service_pipelining: PageServicePipeliningConfig,
pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
pub enable_read_path_debugging: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub validate_wal_contiguity: Option<bool>,
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -523,7 +521,6 @@ impl Default for ConfigToml {
} else {
None
},
validate_wal_contiguity: None,
}
}
}

View File

@@ -57,7 +57,6 @@ pub struct NodeRegisterRequest {
pub listen_http_addr: String,
pub listen_http_port: u16,
pub listen_https_port: Option<u16>,
pub availability_zone_id: AvailabilityZone,
}
@@ -106,7 +105,6 @@ pub struct TenantLocateResponseShard {
pub listen_http_addr: String,
pub listen_http_port: u16,
pub listen_https_port: Option<u16>,
}
#[derive(Serialize, Deserialize)]
@@ -150,7 +148,6 @@ pub struct NodeDescribeResponse {
pub listen_http_addr: String,
pub listen_http_port: u16,
pub listen_https_port: Option<u16>,
pub listen_pg_addr: String,
pub listen_pg_port: u16,

View File

@@ -526,13 +526,9 @@ pub struct TenantConfigPatch {
#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
pub struct TenantConfig {
pub checkpoint_distance: Option<u64>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub checkpoint_timeout: Option<Duration>,
pub checkpoint_timeout: Option<String>,
pub compaction_target_size: Option<u64>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub compaction_period: Option<Duration>,
pub compaction_period: Option<String>,
pub compaction_threshold: Option<usize>,
pub compaction_upper_limit: Option<usize>,
// defer parsing compaction_algorithm, like eviction_policy
@@ -543,38 +539,22 @@ pub struct TenantConfig {
pub l0_flush_stall_threshold: Option<usize>,
pub l0_flush_wait_upload: Option<bool>,
pub gc_horizon: Option<u64>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub gc_period: Option<Duration>,
pub gc_period: Option<String>,
pub image_creation_threshold: Option<usize>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub pitr_interval: Option<Duration>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub walreceiver_connect_timeout: Option<Duration>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub lagging_wal_timeout: Option<Duration>,
pub pitr_interval: Option<String>,
pub walreceiver_connect_timeout: Option<String>,
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub eviction_policy: Option<EvictionPolicy>,
pub min_resident_size_override: Option<u64>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub evictions_low_residence_duration_metric_threshold: Option<Duration>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub heatmap_period: Option<Duration>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
pub image_creation_preempt_threshold: Option<usize>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub lsn_lease_length: Option<Duration>,
#[serde(default)]
#[serde(with = "humantime_serde")]
pub lsn_lease_length_for_ts: Option<Duration>,
pub lsn_lease_length: Option<String>,
pub lsn_lease_length_for_ts: Option<String>,
pub timeline_offloading: Option<bool>,
pub wal_receiver_protocol_override: Option<PostgresClientProtocol>,
pub rel_size_v2_enabled: Option<bool>,
@@ -584,10 +564,7 @@ pub struct TenantConfig {
}
impl TenantConfig {
pub fn apply_patch(
self,
patch: TenantConfigPatch,
) -> Result<TenantConfig, humantime::DurationError> {
pub fn apply_patch(self, patch: TenantConfigPatch) -> TenantConfig {
let Self {
mut checkpoint_distance,
mut checkpoint_timeout,
@@ -627,17 +604,11 @@ impl TenantConfig {
} = self;
patch.checkpoint_distance.apply(&mut checkpoint_distance);
patch
.checkpoint_timeout
.map(|v| humantime::parse_duration(&v))?
.apply(&mut checkpoint_timeout);
patch.checkpoint_timeout.apply(&mut checkpoint_timeout);
patch
.compaction_target_size
.apply(&mut compaction_target_size);
patch
.compaction_period
.map(|v| humantime::parse_duration(&v))?
.apply(&mut compaction_period);
patch.compaction_period.apply(&mut compaction_period);
patch.compaction_threshold.apply(&mut compaction_threshold);
patch
.compaction_upper_limit
@@ -655,25 +626,15 @@ impl TenantConfig {
.apply(&mut l0_flush_stall_threshold);
patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload);
patch.gc_horizon.apply(&mut gc_horizon);
patch
.gc_period
.map(|v| humantime::parse_duration(&v))?
.apply(&mut gc_period);
patch.gc_period.apply(&mut gc_period);
patch
.image_creation_threshold
.apply(&mut image_creation_threshold);
patch
.pitr_interval
.map(|v| humantime::parse_duration(&v))?
.apply(&mut pitr_interval);
patch.pitr_interval.apply(&mut pitr_interval);
patch
.walreceiver_connect_timeout
.map(|v| humantime::parse_duration(&v))?
.apply(&mut walreceiver_connect_timeout);
patch
.lagging_wal_timeout
.map(|v| humantime::parse_duration(&v))?
.apply(&mut lagging_wal_timeout);
patch.lagging_wal_timeout.apply(&mut lagging_wal_timeout);
patch.max_lsn_wal_lag.apply(&mut max_lsn_wal_lag);
patch.eviction_policy.apply(&mut eviction_policy);
patch
@@ -681,12 +642,8 @@ impl TenantConfig {
.apply(&mut min_resident_size_override);
patch
.evictions_low_residence_duration_metric_threshold
.map(|v| humantime::parse_duration(&v))?
.apply(&mut evictions_low_residence_duration_metric_threshold);
patch
.heatmap_period
.map(|v| humantime::parse_duration(&v))?
.apply(&mut heatmap_period);
patch.heatmap_period.apply(&mut heatmap_period);
patch.lazy_slru_download.apply(&mut lazy_slru_download);
patch
.timeline_get_throttle
@@ -697,13 +654,9 @@ impl TenantConfig {
patch
.image_creation_preempt_threshold
.apply(&mut image_creation_preempt_threshold);
patch
.lsn_lease_length
.map(|v| humantime::parse_duration(&v))?
.apply(&mut lsn_lease_length);
patch.lsn_lease_length.apply(&mut lsn_lease_length);
patch
.lsn_lease_length_for_ts
.map(|v| humantime::parse_duration(&v))?
.apply(&mut lsn_lease_length_for_ts);
patch.timeline_offloading.apply(&mut timeline_offloading);
patch
@@ -720,7 +673,7 @@ impl TenantConfig {
.gc_compaction_ratio_percent
.apply(&mut gc_compaction_ratio_percent);
Ok(Self {
Self {
checkpoint_distance,
checkpoint_timeout,
compaction_target_size,
@@ -756,7 +709,7 @@ impl TenantConfig {
gc_compaction_enabled,
gc_compaction_initial_threshold_kb,
gc_compaction_ratio_percent,
})
}
}
}
@@ -2550,7 +2503,7 @@ mod tests {
..base.clone()
};
let patched = base.apply_patch(decoded.config).unwrap();
let patched = base.apply_patch(decoded.config);
assert_eq!(patched, expected);
}

View File

@@ -27,7 +27,7 @@ humantime.workspace = true
fail.workspace = true
futures = { workspace = true }
jsonwebtoken.workspace = true
nix = { workspace = true, features = ["ioctl"] }
nix = {workspace = true, features = [ "ioctl" ] }
once_cell.workspace = true
pin-project-lite.workspace = true
regex.workspace = true
@@ -61,7 +61,6 @@ bytes.workspace = true
criterion.workspace = true
hex-literal.workspace = true
camino-tempfile.workspace = true
pprof.workspace = true
serde_assert.workspace = true
tokio = { workspace = true, features = ["test-util"] }

View File

@@ -1,26 +0,0 @@
## Utils Benchmarks
To run benchmarks:
```sh
# All benchmarks.
cargo bench --package utils
# Specific file.
cargo bench --package utils --bench benchmarks
# Specific benchmark.
cargo bench --package utils --bench benchmarks warn_slow/enabled=true
# List available benchmarks.
cargo bench --package utils --benches -- --list
# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds.
# Output in target/criterion/*/profile/flamegraph.svg.
cargo bench --package utils --bench benchmarks warn_slow/enabled=true --profile-time 10
```
Additional charts and statistics are available in `target/criterion/report/index.html`.
Benchmarks are automatically compared against the previous run. To compare against other runs, see
`--baseline` and `--save-baseline`.

View File

@@ -1,18 +1,5 @@
use std::time::Duration;
use criterion::{criterion_group, criterion_main, Bencher, Criterion};
use pprof::criterion::{Output, PProfProfiler};
use criterion::{criterion_group, criterion_main, Criterion};
use utils::id;
use utils::logging::warn_slow;
// Register benchmarks with Criterion.
criterion_group!(
name = benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = bench_id_stringify,
bench_warn_slow,
);
criterion_main!(benches);
pub fn bench_id_stringify(c: &mut Criterion) {
// Can only use public methods.
@@ -29,31 +16,5 @@ pub fn bench_id_stringify(c: &mut Criterion) {
});
}
pub fn bench_warn_slow(c: &mut Criterion) {
for enabled in [false, true] {
c.bench_function(&format!("warn_slow/enabled={enabled}"), |b| {
run_bench(b, enabled).unwrap()
});
}
// The actual benchmark.
fn run_bench(b: &mut Bencher, enabled: bool) -> anyhow::Result<()> {
const THRESHOLD: Duration = Duration::from_secs(1);
// Use a multi-threaded runtime to avoid thread parking overhead when yielding.
let runtime = tokio::runtime::Builder::new_multi_thread()
.enable_all()
.build()?;
// Test both with and without warn_slow, since we're essentially measuring Tokio scheduling
// performance too. Use a simple noop future that yields once, to avoid any scheduler fast
// paths for a ready future.
if enabled {
b.iter(|| runtime.block_on(warn_slow("ready", THRESHOLD, tokio::task::yield_now())));
} else {
b.iter(|| runtime.block_on(tokio::task::yield_now()));
}
Ok(())
}
}
criterion_group!(benches, bench_id_stringify);
criterion_main!(benches);

View File

@@ -1,13 +1,9 @@
use std::future::Future;
use std::str::FromStr;
use std::time::Duration;
use anyhow::Context;
use metrics::{IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
use strum_macros::{EnumString, VariantNames};
use tokio::time::Instant;
use tracing::warn;
/// Logs a critical error, similarly to `tracing::error!`. This will:
///
@@ -322,41 +318,6 @@ impl std::fmt::Debug for SecretString {
}
}
/// Logs a periodic warning if a future is slow to complete.
///
/// This is performance-sensitive as it's used on the GetPage read path.
#[inline]
pub async fn warn_slow<O>(name: &str, threshold: Duration, f: impl Future<Output = O>) -> O {
// TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
// won't fit on the stack.
let mut f = Box::pin(f);
let started = Instant::now();
let mut attempt = 1;
loop {
// NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common
// case where the timeout doesn't fire.
let deadline = started + attempt * threshold;
if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await {
// NB: we check if we exceeded the threshold even if the timeout never fired, because
// scheduling or execution delays may cause the future to succeed even if it exceeds the
// timeout. This costs an extra unconditional clock reading, but seems worth it to avoid
// false negatives.
let elapsed = started.elapsed();
if elapsed >= threshold {
warn!("slow {name} completed after {:.3}s", elapsed.as_secs_f64());
}
return output;
}
let elapsed = started.elapsed().as_secs_f64();
warn!("slow {name} still running after {elapsed:.3}s",);
attempt += 1;
}
}
#[cfg(test)]
mod tests {
use metrics::{core::Opts, IntCounterVec};

View File

@@ -5,7 +5,6 @@ package interpreted_wal;
message InterpretedWalRecords {
repeated InterpretedWalRecord records = 1;
optional uint64 next_record_lsn = 2;
optional uint64 raw_wal_start_lsn = 3;
}
message InterpretedWalRecord {

View File

@@ -60,11 +60,7 @@ pub struct InterpretedWalRecords {
pub records: Vec<InterpretedWalRecord>,
// Start LSN of the next record after the batch.
// Note that said record may not belong to the current shard.
pub next_record_lsn: Lsn,
// Inclusive start LSN of the PG WAL from which the interpreted
// WAL records were extracted. Note that this is not necessarily the
// start LSN of the first interpreted record in the batch.
pub raw_wal_start_lsn: Option<Lsn>,
pub next_record_lsn: Option<Lsn>,
}
/// An interpreted Postgres WAL record, ready to be handled by the pageserver

View File

@@ -167,8 +167,7 @@ impl TryFrom<InterpretedWalRecords> for proto::InterpretedWalRecords {
.collect::<Result<Vec<_>, _>>()?;
Ok(proto::InterpretedWalRecords {
records,
next_record_lsn: Some(value.next_record_lsn.0),
raw_wal_start_lsn: value.raw_wal_start_lsn.map(|l| l.0),
next_record_lsn: value.next_record_lsn.map(|l| l.0),
})
}
}
@@ -255,11 +254,7 @@ impl TryFrom<proto::InterpretedWalRecords> for InterpretedWalRecords {
Ok(InterpretedWalRecords {
records,
next_record_lsn: value
.next_record_lsn
.map(Lsn::from)
.expect("Always provided"),
raw_wal_start_lsn: value.raw_wal_start_lsn.map(Lsn::from),
next_record_lsn: value.next_record_lsn.map(Lsn::from),
})
}
}

View File

@@ -134,7 +134,6 @@ fn main() -> anyhow::Result<()> {
info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation");
info!(?conf.page_service_pipelining, "starting with page service pipelining config");
info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config");

View File

@@ -197,10 +197,6 @@ pub struct PageServerConf {
/// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer
/// files read.
pub enable_read_path_debugging: bool,
/// Interpreted protocol feature: if enabled, validate that the logical WAL received from
/// safekeepers does not have gaps.
pub validate_wal_contiguity: bool,
}
/// Token for authentication to safekeepers
@@ -364,7 +360,6 @@ impl PageServerConf {
page_service_pipelining,
get_vectored_concurrent_io,
enable_read_path_debugging,
validate_wal_contiguity,
} = config_toml;
let mut conf = PageServerConf {
@@ -451,7 +446,6 @@ impl PageServerConf {
virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
no_sync: no_sync.unwrap_or(false),
enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
};
// ------------------------------------------------------------

View File

@@ -173,7 +173,6 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
listen_pg_port: m.postgres_port,
listen_http_addr: m.http_host,
listen_http_port: m.http_port,
listen_https_port: None, // TODO: Support https.
availability_zone_id: az_id.expect("Checked above"),
})
}

View File

@@ -34,13 +34,11 @@ use std::str::FromStr;
use std::sync::Arc;
use std::time::SystemTime;
use std::time::{Duration, Instant};
use strum_macros::IntoStaticStr;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio::io::{AsyncWriteExt, BufWriter};
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::logging::warn_slow;
use utils::sync::gate::{Gate, GateGuard};
use utils::sync::spsc_fold;
use utils::{
@@ -83,9 +81,6 @@ use std::os::fd::AsRawFd;
/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
/// Threshold at which to log a warning about slow GetPage requests.
const WARN_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30);
///////////////////////////////////////////////////////////////////////////////
pub struct Listener {
@@ -599,7 +594,6 @@ struct BatchedTestRequest {
/// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum,
/// so that we don't keep the [`Timeline::gate`] open while the batch
/// is being built up inside the [`spsc_fold`] (pagestream pipelining).
#[derive(IntoStaticStr)]
enum BatchedFeMessage {
Exists {
span: Span,
@@ -644,10 +638,6 @@ enum BatchedFeMessage {
}
impl BatchedFeMessage {
fn as_static_str(&self) -> &'static str {
self.into()
}
fn observe_execution_start(&mut self, at: Instant) {
match self {
BatchedFeMessage::Exists { timer, .. }
@@ -1473,20 +1463,17 @@ impl PageServerHandler {
}
};
let result = warn_slow(
msg.as_static_str(),
WARN_SLOW_GETPAGE_THRESHOLD,
self.pagesteam_handle_batched_message(
let err = self
.pagesteam_handle_batched_message(
pgb_writer,
msg,
io_concurrency.clone(),
&cancel,
protocol_version,
ctx,
),
)
.await;
match result {
)
.await;
match err {
Ok(()) => {}
Err(e) => break e,
}
@@ -1649,17 +1636,13 @@ impl PageServerHandler {
return Err(e);
}
};
warn_slow(
batch.as_static_str(),
WARN_SLOW_GETPAGE_THRESHOLD,
self.pagesteam_handle_batched_message(
pgb_writer,
batch,
io_concurrency.clone(),
&cancel,
protocol_version,
&ctx,
),
self.pagesteam_handle_batched_message(
pgb_writer,
batch,
io_concurrency.clone(),
&cancel,
protocol_version,
&ctx,
)
.await?;
}

View File

@@ -693,15 +693,16 @@ impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
/// This is a conversion from our internal tenant config object to the one used
/// in external APIs.
impl From<TenantConfOpt> for models::TenantConfig {
// TODO(vlad): These are now the same, but they have different serialization logic.
// Can we merge them?
fn from(value: TenantConfOpt) -> Self {
fn humantime(d: Duration) -> String {
format!("{}s", d.as_secs())
}
Self {
checkpoint_distance: value.checkpoint_distance,
checkpoint_timeout: value.checkpoint_timeout,
checkpoint_timeout: value.checkpoint_timeout.map(humantime),
compaction_algorithm: value.compaction_algorithm,
compaction_target_size: value.compaction_target_size,
compaction_period: value.compaction_period,
compaction_period: value.compaction_period.map(humantime),
compaction_threshold: value.compaction_threshold,
compaction_upper_limit: value.compaction_upper_limit,
compaction_l0_first: value.compaction_l0_first,
@@ -710,23 +711,24 @@ impl From<TenantConfOpt> for models::TenantConfig {
l0_flush_stall_threshold: value.l0_flush_stall_threshold,
l0_flush_wait_upload: value.l0_flush_wait_upload,
gc_horizon: value.gc_horizon,
gc_period: value.gc_period,
gc_period: value.gc_period.map(humantime),
image_creation_threshold: value.image_creation_threshold,
pitr_interval: value.pitr_interval,
walreceiver_connect_timeout: value.walreceiver_connect_timeout,
lagging_wal_timeout: value.lagging_wal_timeout,
pitr_interval: value.pitr_interval.map(humantime),
walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime),
lagging_wal_timeout: value.lagging_wal_timeout.map(humantime),
max_lsn_wal_lag: value.max_lsn_wal_lag,
eviction_policy: value.eviction_policy,
min_resident_size_override: value.min_resident_size_override,
evictions_low_residence_duration_metric_threshold: value
.evictions_low_residence_duration_metric_threshold,
heatmap_period: value.heatmap_period,
.evictions_low_residence_duration_metric_threshold
.map(humantime),
heatmap_period: value.heatmap_period.map(humantime),
lazy_slru_download: value.lazy_slru_download,
timeline_get_throttle: value.timeline_get_throttle,
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
image_creation_preempt_threshold: value.image_creation_preempt_threshold,
lsn_lease_length: value.lsn_lease_length,
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts,
lsn_lease_length: value.lsn_lease_length.map(humantime),
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
timeline_offloading: value.timeline_offloading,
wal_receiver_protocol_override: value.wal_receiver_protocol_override,
rel_size_v2_enabled: value.rel_size_v2_enabled,
@@ -758,10 +760,29 @@ mod tests {
assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap());
}
#[test]
fn test_try_from_models_tenant_config_err() {
let tenant_config = models::TenantConfig {
lagging_wal_timeout: Some("5a".to_string()),
..TenantConfig::default()
};
let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config);
assert!(
tenant_conf_opt.is_err(),
"Suceeded to convert TenantConfig to TenantConfOpt"
);
let expected_error_str =
"lagging_wal_timeout: invalid value: string \"5a\", expected a duration";
assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str);
}
#[test]
fn test_try_from_models_tenant_config_success() {
let tenant_config = models::TenantConfig {
lagging_wal_timeout: Some(Duration::from_secs(5)),
lagging_wal_timeout: Some("5s".to_string()),
..TenantConfig::default()
};
@@ -772,4 +793,12 @@ mod tests {
Some(Duration::from_secs(5))
);
}
#[test]
fn test_vlad() {
let tenant_conf_opt: TenantConfOpt =
serde_json::from_str("{\"pitr_interval\": \"24h0m0s\"}").unwrap();
let tenant_config: models::TenantConfig = tenant_conf_opt.clone().into();
assert_eq!(tenant_config.pitr_interval.unwrap(), "24h0m0s");
}
}

View File

@@ -2874,7 +2874,6 @@ impl Timeline {
auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
availability_zone: self.conf.availability_zone.clone(),
ingest_batch_size: self.conf.ingest_batch_size,
validate_wal_contiguity: self.conf.validate_wal_contiguity,
},
broker_client,
ctx,

View File

@@ -2212,7 +2212,7 @@ impl Timeline {
let sub_compaction_max_job_size_mb =
sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB);
let mut compact_jobs = Vec::<GcCompactJob>::new();
let mut compact_jobs = Vec::new();
// For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
// by estimating the amount of files read for a compaction job. We should also partition on LSN.
let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone();
@@ -2299,25 +2299,16 @@ impl Timeline {
} else {
end
};
if total_size == 0 && !compact_jobs.is_empty() {
info!(
"splitting compaction job: {}..{}, estimated_size={}, extending the previous job",
start, end, total_size
);
compact_jobs.last_mut().unwrap().compact_key_range.end = end;
current_start = Some(end);
} else {
info!(
"splitting compaction job: {}..{}, estimated_size={}",
start, end, total_size
);
compact_jobs.push(GcCompactJob {
dry_run: job.dry_run,
compact_key_range: start..end,
compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn,
});
current_start = Some(end);
}
info!(
"splitting compaction job: {}..{}, estimated_size={}",
start, end, total_size
);
compact_jobs.push(GcCompactJob {
dry_run: job.dry_run,
compact_key_range: start..end,
compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn,
});
current_start = Some(end);
}
}
Ok(compact_jobs)

View File

@@ -56,7 +56,6 @@ pub struct WalReceiverConf {
pub auth_token: Option<Arc<String>>,
pub availability_zone: Option<String>,
pub ingest_batch_size: u64,
pub validate_wal_contiguity: bool,
}
pub struct WalReceiver {

View File

@@ -537,7 +537,6 @@ impl ConnectionManagerState {
let connect_timeout = self.conf.wal_connect_timeout;
let ingest_batch_size = self.conf.ingest_batch_size;
let protocol = self.conf.protocol;
let validate_wal_contiguity = self.conf.validate_wal_contiguity;
let timeline = Arc::clone(&self.timeline);
let ctx = ctx.detached_child(
TaskKind::WalReceiverConnectionHandler,
@@ -559,7 +558,6 @@ impl ConnectionManagerState {
ctx,
node_id,
ingest_batch_size,
validate_wal_contiguity,
)
.await;
@@ -1565,7 +1563,6 @@ mod tests {
auth_token: None,
availability_zone: None,
ingest_batch_size: 1,
validate_wal_contiguity: false,
},
wal_connection: None,
wal_stream_candidates: HashMap::new(),

View File

@@ -120,7 +120,6 @@ pub(super) async fn handle_walreceiver_connection(
ctx: RequestContext,
safekeeper_node: NodeId,
ingest_batch_size: u64,
validate_wal_contiguity: bool,
) -> Result<(), WalReceiverError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -275,7 +274,6 @@ pub(super) async fn handle_walreceiver_connection(
} => Some((format, compression)),
};
let mut expected_wal_start = startpoint;
while let Some(replication_message) = {
select! {
_ = cancellation.cancelled() => {
@@ -342,49 +340,13 @@ pub(super) async fn handle_walreceiver_connection(
)
})?;
// Guard against WAL gaps. If the start LSN of the PG WAL section
// from which the interpreted records were extracted, doesn't match
// the end of the previous batch (or the starting point for the first batch),
// then kill this WAL receiver connection and start a new one.
if validate_wal_contiguity {
if let Some(raw_wal_start_lsn) = batch.raw_wal_start_lsn {
match raw_wal_start_lsn.cmp(&expected_wal_start) {
std::cmp::Ordering::Greater => {
let msg = format!(
"Gap in streamed WAL: [{}, {})",
expected_wal_start, raw_wal_start_lsn
);
critical!("{msg}");
return Err(WalReceiverError::Other(anyhow!(msg)));
}
std::cmp::Ordering::Less => {
// Other shards are reading WAL behind us.
// This is valid, but check that we received records
// that we haven't seen before.
if let Some(first_rec) = batch.records.first() {
if first_rec.next_record_lsn < last_rec_lsn {
let msg = format!(
"Received record with next_record_lsn multiple times ({} < {})",
first_rec.next_record_lsn, expected_wal_start
);
critical!("{msg}");
return Err(WalReceiverError::Other(anyhow!(msg)));
}
}
}
std::cmp::Ordering::Equal => {}
}
}
}
let InterpretedWalRecords {
records,
next_record_lsn,
raw_wal_start_lsn: _,
} = batch;
tracing::debug!(
"Received WAL up to {} with next_record_lsn={}",
"Received WAL up to {} with next_record_lsn={:?}",
streaming_lsn,
next_record_lsn
);
@@ -461,11 +423,12 @@ pub(super) async fn handle_walreceiver_connection(
// need to advance last record LSN on all shards. If we've not ingested the latest
// record, then set the LSN of the modification past it. This way all shards
// advance their last record LSN at the same time.
let needs_last_record_lsn_advance = if next_record_lsn > modification.get_lsn() {
modification.set_lsn(next_record_lsn).unwrap();
true
} else {
false
let needs_last_record_lsn_advance = match next_record_lsn {
Some(lsn) if lsn > modification.get_lsn() => {
modification.set_lsn(lsn).unwrap();
true
}
_ => false,
};
if uncommitted_records > 0 || needs_last_record_lsn_advance {
@@ -483,8 +446,9 @@ pub(super) async fn handle_walreceiver_connection(
timeline.get_last_record_lsn()
);
last_rec_lsn = next_record_lsn;
expected_wal_start = streaming_lsn;
if let Some(lsn) = next_record_lsn {
last_rec_lsn = lsn;
}
Some(streaming_lsn)
}

View File

@@ -474,7 +474,8 @@ readahead_buffer_resize(int newsize, void *extra)
*/
if (MyPState->n_requests_inflight > newsize)
{
prefetch_wait_for(MyPState->ring_unused - newsize - 1);
Assert(MyPState->ring_unused >= MyPState->n_requests_inflight - newsize);
prefetch_wait_for(MyPState->ring_unused - (MyPState->n_requests_inflight - newsize));
Assert(MyPState->n_requests_inflight <= newsize);
}

View File

@@ -1,5 +1,5 @@
[toolchain]
channel = "1.85.0"
channel = "1.84.1"
profile = "default"
# The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
# https://rust-lang.github.io/rustup/concepts/profiles.html

View File

@@ -295,10 +295,6 @@ impl InterpretedWalReader {
let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version);
// Tracks the start of the PG WAL LSN from which the current batch of
// interpreted records originated.
let mut current_batch_wal_start_lsn: Option<Lsn> = None;
loop {
tokio::select! {
// Main branch for reading WAL and forwarding it
@@ -306,7 +302,7 @@ impl InterpretedWalReader {
let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below"));
let WalBytes {
wal,
wal_start_lsn,
wal_start_lsn: _,
wal_end_lsn,
available_wal_end_lsn,
} = match wal {
@@ -319,12 +315,6 @@ impl InterpretedWalReader {
}
};
// We will already have a value if the previous chunks of WAL
// did not decode into anything useful.
if current_batch_wal_start_lsn.is_none() {
current_batch_wal_start_lsn = Some(wal_start_lsn);
}
wal_decoder.feed_bytes(&wal);
// Deserialize and interpret WAL records from this batch of WAL.
@@ -373,9 +363,7 @@ impl InterpretedWalReader {
let max_next_record_lsn = match max_next_record_lsn {
Some(lsn) => lsn,
None => {
continue;
}
None => { continue; }
};
// Update the current position such that new receivers can decide
@@ -389,38 +377,21 @@ impl InterpretedWalReader {
}
}
let batch_wal_start_lsn = current_batch_wal_start_lsn.take().unwrap();
// Send interpreted records downstream. Anything that has already been seen
// by a shard is filtered out.
let mut shard_senders_to_remove = Vec::new();
for (shard, states) in &mut self.shard_senders {
for state in states {
let shard_sender_id = ShardSenderId::new(*shard, state.sender_id);
let batch = if max_next_record_lsn > state.next_record_lsn {
// This batch contains at least one record that this shard has not
// seen yet.
let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default();
InterpretedWalRecords {
records,
next_record_lsn: max_next_record_lsn,
raw_wal_start_lsn: Some(batch_wal_start_lsn),
}
} else if wal_end_lsn > state.next_record_lsn {
// All the records in this batch were seen by the shard
// However, the batch maps to a chunk of WAL that the
// shard has not yet seen. Notify it of the start LSN
// of the PG WAL chunk such that it doesn't look like a gap.
InterpretedWalRecords {
records: Vec::default(),
next_record_lsn: state.next_record_lsn,
raw_wal_start_lsn: Some(batch_wal_start_lsn),
}
} else {
// The shard has seen this chunk of WAL before. Skip it.
if max_next_record_lsn <= state.next_record_lsn {
continue;
}
let shard_sender_id = ShardSenderId::new(*shard, state.sender_id);
let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default();
let batch = InterpretedWalRecords {
records,
next_record_lsn: Some(max_next_record_lsn),
};
let res = state.tx.send(Batch {
@@ -432,7 +403,7 @@ impl InterpretedWalReader {
if res.is_err() {
shard_senders_to_remove.push(shard_sender_id);
} else {
state.next_record_lsn = std::cmp::max(state.next_record_lsn, max_next_record_lsn);
state.next_record_lsn = max_next_record_lsn;
}
}
}

View File

@@ -24,7 +24,6 @@ hex.workspace = true
hyper0.workspace = true
humantime.workspace = true
itertools.workspace = true
json-structural-diff.workspace = true
lasso.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true

View File

@@ -1 +0,0 @@
ALTER TABLE nodes DROP listen_https_port;

View File

@@ -1 +0,0 @@
ALTER TABLE nodes ADD listen_https_port INTEGER;

View File

@@ -126,10 +126,6 @@ struct Cli {
#[arg(long)]
long_reconcile_threshold: Option<humantime::Duration>,
// Flag to use https for requests to pageserver API.
#[arg(long, default_value = "false")]
use_https_pageserver_api: bool,
}
enum StrictMode {
@@ -325,7 +321,6 @@ async fn async_main() -> anyhow::Result<()> {
address_for_peers: args.address_for_peers,
start_as_candidate: args.start_as_candidate,
http_service_port: args.listen.port() as i32,
use_https_pageserver_api: args.use_https_pageserver_api,
};
// Validate that we can connect to the database

View File

@@ -1,6 +1,5 @@
use std::{str::FromStr, time::Duration};
use anyhow::anyhow;
use pageserver_api::{
controller_api::{
AvailabilityZone, NodeAvailability, NodeDescribeResponse, NodeRegisterRequest,
@@ -33,16 +32,12 @@ pub(crate) struct Node {
listen_http_addr: String,
listen_http_port: u16,
listen_https_port: Option<u16>,
listen_pg_addr: String,
listen_pg_port: u16,
availability_zone_id: AvailabilityZone,
// Flag from storcon's config to use https for pageserver admin API.
// Invariant: if |true|, listen_https_port should contain a value.
use_https: bool,
// This cancellation token means "stop any RPCs in flight to this node, and don't start
// any more". It is not related to process shutdown.
#[serde(skip)]
@@ -61,16 +56,7 @@ pub(crate) enum AvailabilityTransition {
impl Node {
pub(crate) fn base_url(&self) -> String {
if self.use_https {
format!(
"https://{}:{}",
self.listen_http_addr,
self.listen_https_port
.expect("https port should be specified if use_https is on")
)
} else {
format!("http://{}:{}", self.listen_http_addr, self.listen_http_port)
}
format!("http://{}:{}", self.listen_http_addr, self.listen_http_port)
}
pub(crate) fn get_id(&self) -> NodeId {
@@ -96,20 +82,11 @@ impl Node {
self.id == register_req.node_id
&& self.listen_http_addr == register_req.listen_http_addr
&& self.listen_http_port == register_req.listen_http_port
// Note: listen_https_port may change. See [`Self::need_update`] for mode details.
// && self.listen_https_port == register_req.listen_https_port
&& self.listen_pg_addr == register_req.listen_pg_addr
&& self.listen_pg_port == register_req.listen_pg_port
&& self.availability_zone_id == register_req.availability_zone_id
}
// Do we need to update an existing record in DB on this registration request?
pub(crate) fn need_update(&self, register_req: &NodeRegisterRequest) -> bool {
// listen_https_port is checked here because it may change during migration to https.
// After migration, this check may be moved to registration_match.
self.listen_https_port != register_req.listen_https_port
}
/// For a shard located on this node, populate a response object
/// with this node's address information.
pub(crate) fn shard_location(&self, shard_id: TenantShardId) -> TenantLocateResponseShard {
@@ -118,7 +95,6 @@ impl Node {
node_id: self.id,
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port,
listen_https_port: self.listen_https_port,
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port,
}
@@ -199,34 +175,25 @@ impl Node {
}
}
#[allow(clippy::too_many_arguments)]
pub(crate) fn new(
id: NodeId,
listen_http_addr: String,
listen_http_port: u16,
listen_https_port: Option<u16>,
listen_pg_addr: String,
listen_pg_port: u16,
availability_zone_id: AvailabilityZone,
use_https: bool,
) -> anyhow::Result<Self> {
if use_https && listen_https_port.is_none() {
return Err(anyhow!("https is enabled, but node has no https port"));
}
Ok(Self {
) -> Self {
Self {
id,
listen_http_addr,
listen_http_port,
listen_https_port,
listen_pg_addr,
listen_pg_port,
scheduling: NodeSchedulingPolicy::Active,
availability: NodeAvailability::Offline,
availability_zone_id,
use_https,
cancel: CancellationToken::new(),
})
}
}
pub(crate) fn to_persistent(&self) -> NodePersistence {
@@ -235,19 +202,14 @@ impl Node {
scheduling_policy: self.scheduling.into(),
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port as i32,
listen_https_port: self.listen_https_port.map(|x| x as i32),
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port as i32,
availability_zone_id: self.availability_zone_id.0.clone(),
}
}
pub(crate) fn from_persistent(np: NodePersistence, use_https: bool) -> anyhow::Result<Self> {
if use_https && np.listen_https_port.is_none() {
return Err(anyhow!("https is enabled, but node has no https port"));
}
Ok(Self {
pub(crate) fn from_persistent(np: NodePersistence) -> Self {
Self {
id: NodeId(np.node_id as u64),
// At startup we consider a node offline until proven otherwise.
availability: NodeAvailability::Offline,
@@ -255,13 +217,11 @@ impl Node {
.expect("Bad scheduling policy in DB"),
listen_http_addr: np.listen_http_addr,
listen_http_port: np.listen_http_port as u16,
listen_https_port: np.listen_https_port.map(|x| x as u16),
listen_pg_addr: np.listen_pg_addr,
listen_pg_port: np.listen_pg_port as u16,
availability_zone_id: AvailabilityZone(np.availability_zone_id),
use_https,
cancel: CancellationToken::new(),
})
}
}
/// Wrapper for issuing requests to pageserver management API: takes care of generic
@@ -325,9 +285,8 @@ impl Node {
warn_threshold,
max_retries,
&format!(
"Call to node {} ({}) management API",
self.id,
self.base_url(),
"Call to node {} ({}:{}) management API",
self.id, self.listen_http_addr, self.listen_http_port
),
cancel,
)
@@ -343,7 +302,6 @@ impl Node {
availability_zone_id: self.availability_zone_id.0.clone(),
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port,
listen_https_port: self.listen_https_port,
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port,
}

View File

@@ -375,23 +375,18 @@ impl Persistence {
Ok(nodes)
}
pub(crate) async fn update_node<V>(
pub(crate) async fn update_node(
&self,
input_node_id: NodeId,
values: V,
) -> DatabaseResult<()>
where
V: diesel::AsChangeset<Target = crate::schema::nodes::table> + Clone + Send + Sync,
V::Changeset: diesel::query_builder::QueryFragment<diesel::pg::Pg> + Send, // valid Postgres SQL
{
input_scheduling: NodeSchedulingPolicy,
) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
let updated = self
.with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
let values = values.clone();
Box::pin(async move {
let updated = diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.set(values)
.set((scheduling_policy.eq(String::from(input_scheduling)),))
.execute(conn)
.await?;
Ok(updated)
@@ -408,32 +403,6 @@ impl Persistence {
}
}
pub(crate) async fn update_node_scheduling_policy(
&self,
input_node_id: NodeId,
input_scheduling: NodeSchedulingPolicy,
) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
self.update_node(
input_node_id,
scheduling_policy.eq(String::from(input_scheduling)),
)
.await
}
pub(crate) async fn update_node_on_registration(
&self,
input_node_id: NodeId,
input_https_port: Option<u16>,
) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
self.update_node(
input_node_id,
listen_https_port.eq(input_https_port.map(|x| x as i32)),
)
.await
}
/// At startup, load the high level state for shards, such as their config + policy. This will
/// be enriched at runtime with state discovered on pageservers.
///
@@ -1483,7 +1452,6 @@ pub(crate) struct NodePersistence {
pub(crate) listen_pg_addr: String,
pub(crate) listen_pg_port: i32,
pub(crate) availability_zone_id: String,
pub(crate) listen_https_port: Option<i32>,
}
/// Tenant metadata health status that are stored durably.

View File

@@ -1,7 +1,6 @@
use crate::pageserver_client::PageserverClient;
use crate::persistence::Persistence;
use crate::{compute_hook, service};
use json_structural_diff::JsonDiff;
use pageserver_api::controller_api::{AvailabilityZone, MigrationConfig, PlacementPolicy};
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, TenantWaitLsnRequest,
@@ -25,7 +24,7 @@ use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node;
use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation};
const DEFAULT_HEATMAP_PERIOD: Duration = Duration::from_secs(60);
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
/// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states.
@@ -881,27 +880,7 @@ impl Reconciler {
self.generation = Some(generation);
wanted_conf.generation = generation.into();
}
let diff = match observed {
Some(ObservedStateLocation {
conf: Some(observed),
}) => {
let diff = JsonDiff::diff(
&serde_json::to_value(observed.clone()).unwrap(),
&serde_json::to_value(wanted_conf.clone()).unwrap(),
false,
);
if let Some(json_diff) = diff.diff {
serde_json::to_string(&json_diff).unwrap_or("diff err".to_string())
} else {
"unknown".to_string()
}
}
_ => "full".to_string(),
};
tracing::info!(node_id=%node.get_id(), "Observed configuration requires update: {diff}");
tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
// Because `node` comes from a ref to &self, clone it before calling into a &mut self
// function: this could be avoided by refactoring the state mutated by location_config into
@@ -1201,7 +1180,7 @@ fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig
let mut config = config.clone();
if has_secondaries {
if config.heatmap_period.is_none() {
config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD);
config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
}
} else {
config.heatmap_period = None;

View File

@@ -930,16 +930,13 @@ pub(crate) mod test_utils {
NodeId(i),
format!("httphost-{i}"),
80 + i as u16,
None,
format!("pghost-{i}"),
5432 + i as u16,
az_iter
.next()
.cloned()
.unwrap_or(AvailabilityZone("test-az".to_string())),
false,
)
.unwrap();
);
node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0)));
assert!(node.is_available());
node

View File

@@ -26,7 +26,6 @@ diesel::table! {
listen_pg_addr -> Varchar,
listen_pg_port -> Int4,
availability_zone_id -> Varchar,
listen_https_port -> Nullable<Int4>,
}
}

View File

@@ -399,8 +399,6 @@ pub struct Config {
pub http_service_port: i32,
pub long_reconcile_threshold: Duration,
pub use_https_pageserver_api: bool,
}
impl From<DatabaseError> for ApiError {
@@ -1403,8 +1401,8 @@ impl Service {
.list_nodes()
.await?
.into_iter()
.map(|x| Node::from_persistent(x, config.use_https_pageserver_api))
.collect::<anyhow::Result<Vec<Node>>>()?;
.map(Node::from_persistent)
.collect::<Vec<_>>();
let nodes: HashMap<NodeId, Node> = nodes.into_iter().map(|n| (n.get_id(), n)).collect();
tracing::info!("Loaded {} nodes from database.", nodes.len());
metrics::METRICS_REGISTRY
@@ -1503,13 +1501,10 @@ impl Service {
NodeId(node_id as u64),
"".to_string(),
123,
None,
"".to_string(),
123,
AvailabilityZone("test_az".to_string()),
false,
)
.unwrap();
);
scheduler.node_upsert(&node);
}
@@ -2921,9 +2916,7 @@ impl Service {
first
};
let updated_config = base
.apply_patch(patch)
.map_err(|err| ApiError::BadRequest(anyhow::anyhow!(err)))?;
let updated_config = base.apply_patch(patch);
self.set_tenant_config_and_reconcile(tenant_id, updated_config)
.await
}
@@ -5914,10 +5907,8 @@ impl Service {
)
.await;
#[derive(PartialEq)]
enum RegistrationStatus {
UpToDate,
NeedUpdate,
Matched,
Mismatched,
New,
}
@@ -5926,11 +5917,7 @@ impl Service {
let locked = self.inner.read().unwrap();
if let Some(node) = locked.nodes.get(&register_req.node_id) {
if node.registration_match(&register_req) {
if node.need_update(&register_req) {
RegistrationStatus::NeedUpdate
} else {
RegistrationStatus::UpToDate
}
RegistrationStatus::Matched
} else {
RegistrationStatus::Mismatched
}
@@ -5940,9 +5927,9 @@ impl Service {
};
match registration_status {
RegistrationStatus::UpToDate => {
RegistrationStatus::Matched => {
tracing::info!(
"Node {} re-registered with matching address and is up to date",
"Node {} re-registered with matching address",
register_req.node_id
);
@@ -5960,7 +5947,7 @@ impl Service {
"Node is already registered with different address".to_string(),
));
}
RegistrationStatus::New | RegistrationStatus::NeedUpdate => {
RegistrationStatus::New => {
// fallthrough
}
}
@@ -5989,16 +5976,6 @@ impl Service {
));
}
if self.config.use_https_pageserver_api && register_req.listen_https_port.is_none() {
return Err(ApiError::PreconditionFailed(
format!(
"Node {} has no https port, but use_https is enabled",
register_req.node_id
)
.into(),
));
}
// Ordering: we must persist the new node _before_ adding it to in-memory state.
// This ensures that before we use it for anything or expose it via any external
// API, it is guaranteed to be available after a restart.
@@ -6006,29 +5983,13 @@ impl Service {
register_req.node_id,
register_req.listen_http_addr,
register_req.listen_http_port,
register_req.listen_https_port,
register_req.listen_pg_addr,
register_req.listen_pg_port,
register_req.availability_zone_id.clone(),
self.config.use_https_pageserver_api,
);
let new_node = match new_node {
Ok(new_node) => new_node,
Err(error) => return Err(ApiError::InternalServerError(error)),
};
match registration_status {
RegistrationStatus::New => self.persistence.insert_node(&new_node).await?,
RegistrationStatus::NeedUpdate => {
self.persistence
.update_node_on_registration(
register_req.node_id,
register_req.listen_https_port,
)
.await?
}
_ => unreachable!("Other statuses have been processed earlier"),
}
// TODO: idempotency if the node already exists in the database
self.persistence.insert_node(&new_node).await?;
let mut locked = self.inner.write().unwrap();
let mut new_nodes = (*locked.nodes).clone();
@@ -6043,24 +6004,12 @@ impl Service {
.storage_controller_pageserver_nodes
.set(locked.nodes.len() as i64);
match registration_status {
RegistrationStatus::New => {
tracing::info!(
"Registered pageserver {} ({}), now have {} pageservers",
register_req.node_id,
register_req.availability_zone_id,
locked.nodes.len()
);
}
RegistrationStatus::NeedUpdate => {
tracing::info!(
"Re-registered and updated node {} ({})",
register_req.node_id,
register_req.availability_zone_id,
);
}
_ => unreachable!("Other statuses have been processed earlier"),
}
tracing::info!(
"Registered pageserver {} ({}), now have {} pageservers",
register_req.node_id,
register_req.availability_zone_id,
locked.nodes.len()
);
Ok(())
}
@@ -6078,9 +6027,7 @@ impl Service {
if let Some(scheduling) = scheduling {
// Scheduling is a persistent part of Node: we must write updates to the database before
// applying them in memory
self.persistence
.update_node_scheduling_policy(node_id, scheduling)
.await?;
self.persistence.update_node(node_id, scheduling).await?;
}
// If we're activating a node, then before setting it active we must reconcile any shard locations
@@ -6651,12 +6598,11 @@ impl Service {
) -> Option<ReconcilerWaiter> {
let reconcile_needed = shard.get_reconcile_needed(nodes);
let reconcile_reason = match reconcile_needed {
match reconcile_needed {
ReconcileNeeded::No => return None,
ReconcileNeeded::WaitExisting(waiter) => return Some(waiter),
ReconcileNeeded::Yes(reason) => {
ReconcileNeeded::Yes => {
// Fall through to try and acquire units for spawning reconciler
reason
}
};
@@ -6695,7 +6641,6 @@ impl Service {
};
shard.spawn_reconciler(
reconcile_reason,
&self.result_tx,
nodes,
&self.compute_hook,

View File

@@ -481,14 +481,7 @@ pub(crate) enum ReconcileNeeded {
/// spawned: wait for the existing reconciler rather than spawning a new one.
WaitExisting(ReconcilerWaiter),
/// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`]
Yes(ReconcileReason),
}
#[derive(Debug)]
pub(crate) enum ReconcileReason {
ActiveNodesDirty,
UnknownLocation,
PendingComputeNotification,
Yes,
}
/// Pending modification to the observed state of a tenant shard.
@@ -1294,7 +1287,13 @@ impl TenantShard {
attached_location_conf(generation, &self.shard, &self.config, &self.policy);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
Some(conf) => {
tracing::info!("Wanted: {wanted_conf:?}");
tracing::info!("Observed: {conf:?}");
dirty_nodes.insert(node_id);
}
None => {
dirty_nodes.insert(node_id);
}
}
@@ -1348,18 +1347,12 @@ impl TenantShard {
let active_nodes_dirty = self.dirty(pageservers);
let reconcile_needed = match (
active_nodes_dirty,
dirty_observed,
self.pending_compute_notification,
) {
(true, _, _) => ReconcileNeeded::Yes(ReconcileReason::ActiveNodesDirty),
(_, true, _) => ReconcileNeeded::Yes(ReconcileReason::UnknownLocation),
(_, _, true) => ReconcileNeeded::Yes(ReconcileReason::PendingComputeNotification),
_ => ReconcileNeeded::No,
};
// Even if there is no pageserver work to be done, if we have a pending notification to computes,
// wake up a reconciler to send it.
let do_reconcile =
active_nodes_dirty || dirty_observed || self.pending_compute_notification;
if matches!(reconcile_needed, ReconcileNeeded::No) {
if !do_reconcile {
tracing::debug!("Not dirty, no reconciliation needed.");
return ReconcileNeeded::No;
}
@@ -1402,7 +1395,7 @@ impl TenantShard {
}
}
reconcile_needed
ReconcileNeeded::Yes
}
/// Ensure the sequence number is set to a value where waiting for this value will make us wait
@@ -1492,7 +1485,6 @@ impl TenantShard {
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn spawn_reconciler(
&mut self,
reason: ReconcileReason,
result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResultRequest>,
pageservers: &Arc<HashMap<NodeId, Node>>,
compute_hook: &Arc<ComputeHook>,
@@ -1552,7 +1544,7 @@ impl TenantShard {
let reconcile_seq = self.sequence;
let long_reconcile_threshold = service_config.long_reconcile_threshold;
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler ({reason:?})");
tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
let must_notify = self.pending_compute_notification;
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
tenant_id=%reconciler.tenant_shard_id.tenant_id,

View File

@@ -1167,15 +1167,15 @@ class NeonEnv:
"max_batch_size": 32,
}
# Concurrent IO (https://github.com/neondatabase/neon/issues/9378):
# enable concurrent IO by default in tests and benchmarks.
# Compat tests are exempt because old versions fail to parse the new config.
get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io
if config.test_may_use_compatibility_snapshot_binaries:
log.info(
"Skipping WAL contiguity validation to avoid forward-compatibility related test failures"
"Forcing use of binary-built-in default to avoid forward-compatibility related test failures"
)
else:
# Look for gaps in WAL received from safekeepeers
ps_cfg["validate_wal_contiguity"] = True
get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io
get_vectored_concurrent_io = None
if get_vectored_concurrent_io is not None:
ps_cfg["get_vectored_concurrent_io"] = {
"mode": self.pageserver_get_vectored_concurrent_io,
@@ -1630,7 +1630,6 @@ def neon_env_builder(
class PageserverPort:
pg: int
http: int
https: int | None = None
class LogUtils:
@@ -1887,7 +1886,6 @@ class NeonStorageController(MetricsGetter, LogUtils):
"node_id": int(node.id),
"listen_http_addr": "localhost",
"listen_http_port": node.service_port.http,
"listen_https_port": node.service_port.https,
"listen_pg_addr": "localhost",
"listen_pg_port": node.service_port.pg,
"availability_zone_id": node.az_id,

View File

@@ -3766,66 +3766,7 @@ def test_storage_controller_node_flap_detach_race(
wait_until(validate_locations, timeout=10)
def test_update_node_on_registration(neon_env_builder: NeonEnvBuilder):
"""
Check that storage controller handles node_register requests with updated fields correctly.
1. Run storage controller and register 1 pageserver without https port.
2. Register the same pageserver with https port. Check that port has been updated.
3. Restart the storage controller. Check that https port is persistent.
4. Register the same pageserver without https port again (rollback). Check that port has been removed.
"""
neon_env_builder.num_pageservers = 1
env = neon_env_builder.init_configs()
env.storage_controller.start()
env.storage_controller.wait_until_ready()
pageserver = env.pageservers[0]
# Step 1. Register pageserver without https port.
env.storage_controller.node_register(pageserver)
env.storage_controller.consistency_check()
nodes = env.storage_controller.node_list()
assert len(nodes) == 1
assert nodes[0]["listen_https_port"] is None
# Step 2. Register pageserver with https port.
pageserver.service_port.https = 1234
env.storage_controller.node_register(pageserver)
env.storage_controller.consistency_check()
nodes = env.storage_controller.node_list()
assert len(nodes) == 1
assert nodes[0]["listen_https_port"] == 1234
# Step 3. Restart storage controller.
env.storage_controller.stop()
env.storage_controller.start()
env.storage_controller.wait_until_ready()
env.storage_controller.consistency_check()
nodes = env.storage_controller.node_list()
assert len(nodes) == 1
assert nodes[0]["listen_https_port"] == 1234
# Step 4. Register pageserver with no https port again.
pageserver.service_port.https = None
env.storage_controller.node_register(pageserver)
env.storage_controller.consistency_check()
nodes = env.storage_controller.node_list()
assert len(nodes) == 1
assert nodes[0]["listen_https_port"] is None
def test_storage_controller_location_conf_equivalence(neon_env_builder: NeonEnvBuilder):
"""
Validate that a storage controller restart with no shards in a transient state
performs zero reconciliations at start-up. Implicitly, this means that the location
configs returned by the pageserver are identical to the persisted state in the
storage controller database.
"""
def test_storage_controller_config_equivalence(neon_env_builder: NeonEnvBuilder):
neon_env_builder.num_pageservers = 1
neon_env_builder.storage_controller_config = {
"start_as_candidate": False,

View File

@@ -1,10 +1,9 @@
from __future__ import annotations
import threading
import time
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, logical_replication_sync
from fixtures.neon_fixtures import NeonEnv
from fixtures.utils import query_scalar, wait_until
@@ -240,173 +239,3 @@ def test_subscriber_branching(neon_simple_env: NeonEnv):
res = scur_postgres.fetchall()
assert len(res) == 1
assert str(sub_child_2_timeline_id) == res[0][0]
def test_multiple_subscription_branching(neon_simple_env: NeonEnv):
"""
Test that compute_ctl can handle concurrent deletion of subscriptions in a multiple databases
"""
env = neon_simple_env
NUMBER_OF_DBS = 5
# Create and start endpoint so that neon_local put all the generated
# stuff into the spec.json file.
endpoint = env.endpoints.create_start(
"main",
config_lines=[
"max_replication_slots = 10",
"max_logical_replication_workers=10",
"max_worker_processes=10",
],
)
TEST_DB_NAMES = [
{
"name": "neondb",
"owner": "cloud_admin",
},
{
"name": "publisher_db",
"owner": "cloud_admin",
},
]
for i in range(NUMBER_OF_DBS):
TEST_DB_NAMES.append(
{
"name": f"db{i}",
"owner": "cloud_admin",
}
)
# Update the spec.json file to create the databases
# and reconfigure the endpoint to apply the changes.
endpoint.respec_deep(
**{
"skip_pg_catalog_updates": False,
"cluster": {
"databases": TEST_DB_NAMES,
},
}
)
endpoint.reconfigure()
connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''")
# create table, replication and subscription for each of the databases
with endpoint.cursor(dbname="publisher_db") as publisher_cursor:
for i in range(NUMBER_OF_DBS):
publisher_cursor.execute(f"CREATE TABLE t{i}(a int)")
publisher_cursor.execute(f"CREATE PUBLICATION mypub{i} FOR TABLE t{i}")
publisher_cursor.execute(
f"select pg_catalog.pg_create_logical_replication_slot('mysub{i}', 'pgoutput');"
)
publisher_cursor.execute(f"INSERT INTO t{i} VALUES ({i})")
with endpoint.cursor(dbname=f"db{i}") as cursor:
cursor.execute(f"CREATE TABLE t{i}(a int)")
cursor.execute(
f"CREATE SUBSCRIPTION mysub{i} CONNECTION '{connstr}' PUBLICATION mypub{i} WITH (create_slot = false) "
)
# wait for the subscription to be active
for i in range(NUMBER_OF_DBS):
logical_replication_sync(
endpoint,
endpoint,
f"mysub{i}",
sub_dbname=f"db{i}",
pub_dbname="publisher_db",
)
# Check that replication is working
for i in range(NUMBER_OF_DBS):
with endpoint.cursor(dbname=f"db{i}") as cursor:
cursor.execute(f"SELECT * FROM t{i}")
rows = cursor.fetchall()
assert len(rows) == 1
assert rows[0][0] == i
last_insert_lsn = query_scalar(cursor, "select pg_current_wal_insert_lsn();")
def start_publisher_workload(table_num: int, duration: int):
start = time.time()
with endpoint.cursor(dbname="publisher_db") as cur:
while time.time() - start < duration:
cur.execute(f"INSERT INTO t{i} SELECT FROM generate_series(1,1000)")
LOAD_DURATION = 5
threads = [
threading.Thread(target=start_publisher_workload, args=(i, LOAD_DURATION))
for i in range(NUMBER_OF_DBS)
]
for thread in threads:
thread.start()
sub_child_1_timeline_id = env.create_branch(
"subscriber_child_1",
ancestor_branch_name="main",
ancestor_start_lsn=last_insert_lsn,
)
sub_child_1 = env.endpoints.create("subscriber_child_1")
sub_child_1.respec(
skip_pg_catalog_updates=False,
reconfigure_concurrency=5,
drop_subscriptions_before_start=True,
cluster={
"databases": TEST_DB_NAMES,
"roles": [],
},
)
sub_child_1.start()
# ensure that subscription deletion happened on this timeline
with sub_child_1.cursor() as scur_postgres:
scur_postgres.execute("SELECT timeline_id from neon.drop_subscriptions_done")
res = scur_postgres.fetchall()
log.info(f"res = {res}")
assert len(res) == 1
assert str(sub_child_1_timeline_id) == res[0][0]
# ensure that there are no subscriptions in the databases
for i in range(NUMBER_OF_DBS):
with sub_child_1.cursor(dbname=f"db{i}") as cursor:
cursor.execute("SELECT * FROM pg_catalog.pg_subscription")
res = cursor.fetchall()
assert len(res) == 0
# ensure that there are no unexpected rows in the tables
cursor.execute(f"SELECT * FROM t{i}")
rows = cursor.fetchall()
assert len(rows) == 1
assert rows[0][0] == i
for thread in threads:
thread.join()
# ensure that logical replication is still working in main endpoint
# wait for it to catch up
for i in range(NUMBER_OF_DBS):
logical_replication_sync(
endpoint,
endpoint,
f"mysub{i}",
sub_dbname=f"db{i}",
pub_dbname="publisher_db",
)
# verify that the data is the same in publisher and subscriber tables
with endpoint.cursor(dbname="publisher_db") as publisher_cursor:
for i in range(NUMBER_OF_DBS):
with endpoint.cursor(dbname=f"db{i}") as cursor:
publisher_cursor.execute(f"SELECT count(*) FROM t{i}")
cursor.execute(f"SELECT count(*) FROM t{i}")
pub_res = publisher_cursor.fetchone()
sub_res = cursor.fetchone()
log.info(f"for table t{i}: pub_res = {pub_res}, sub_res = {sub_res}")
assert pub_res == sub_res

View File

@@ -5,11 +5,11 @@
],
"v16": [
"16.8",
"261ed10e9b8c8dda01ad7aefb18e944e30aa161d"
"6cb8d22079570b50fcaff29124d40807c1e63a82"
],
"v15": [
"15.12",
"6ff50443773b69749e16da6db9d4f4b19064b4b7"
"023f1020ecb07af3bb0ddbf4622e1a3c3fa276a4"
],
"v14": [
"14.17",