diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 94f768719f..456c7b8c92 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -69,7 +69,7 @@ jobs: submodules: true - name: Check for file changes - uses: step-security/paths-filter@v3 + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3.0.2 id: files-changed with: token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 3427a0eb49..3e81183687 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -53,7 +53,7 @@ jobs: submodules: true - name: Check for Postgres changes - uses: step-security/paths-filter@v3 + uses: dorny/paths-filter@1441771bbfdd59dcd748680ee64ebd8faab1a242 #v3 id: files_changed with: token: ${{ github.token }} diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index 6fb4753fc0..23b8573097 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -34,7 +34,7 @@ jobs: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 + - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 id: python-src with: files: | @@ -45,7 +45,7 @@ jobs: poetry.lock pyproject.toml - - uses: step-security/changed-files@3dbe17c78367e7d60f00d78ae6781a35be47b4a1 # v45.0.1 + - uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 id: rust-src with: files: | diff --git a/.github/workflows/proxy-benchmark.yml b/.github/workflows/proxy-benchmark.yml index 75ecacaced..3a98ad4e8e 100644 --- a/.github/workflows/proxy-benchmark.yml +++ b/.github/workflows/proxy-benchmark.yml @@ -60,22 +60,23 @@ jobs: } >> "$GITHUB_ENV" - name: Run proxy-bench - run: ./${PROXY_BENCH_PATH}/run.sh + run: ${PROXY_BENCH_PATH}/run.sh - name: Ingest Bench Results # neon repo script - if: success() + if: always() run: | mkdir -p $TEST_OUTPUT python $NEON_DIR/scripts/proxy_bench_results_ingest.py --out $TEST_OUTPUT - name: Push Metrics to Proxy perf database - if: success() + if: always() env: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PROXY_TEST_RESULT_CONNSTR }}" REPORT_FROM: $TEST_OUTPUT run: $NEON_DIR/scripts/generate_and_push_perf_report.sh - name: Docker cleanup + if: always() run: docker compose down - name: Notify Failure diff --git a/.gitignore b/.gitignore index 70c7e96303..6574d7b9de 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ /tmp_check_cli __pycache__/ test_output/ +neon_previous/ .vscode .idea *.swp diff --git a/Cargo.lock b/Cargo.lock index 71e78243a6..4c9cfa97e1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1279,6 +1279,7 @@ dependencies = [ "remote_storage", "serde", "serde_json", + "url", "utils", ] @@ -1304,6 +1305,7 @@ dependencies = [ "fail", "flate2", "futures", + "hostname-validator", "http 1.1.0", "indexmap 2.9.0", "itertools 0.10.5", @@ -2770,6 +2772,12 @@ dependencies = [ "windows", ] +[[package]] +name = "hostname-validator" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f558a64ac9af88b5ba400d99b579451af0d39c6d360980045b91aac966d705e2" + [[package]] name = "http" version = "0.2.9" @@ -4421,6 +4429,7 @@ dependencies = [ "strum", "strum_macros", "thiserror 1.0.69", + "tracing", "tracing-utils", "utils", ] @@ -4479,6 +4488,7 @@ 
dependencies = [ "pageserver_api", "postgres_ffi_types", "prost 0.13.5", + "prost-types 0.13.5", "strum", "strum_macros", "thiserror 1.0.69", @@ -5156,7 +5166,7 @@ dependencies = [ "petgraph", "prettyplease", "prost 0.13.5", - "prost-types 0.13.3", + "prost-types 0.13.5", "regex", "syn 2.0.100", "tempfile", @@ -5199,9 +5209,9 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.13.3" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" dependencies = [ "prost 0.13.5", ] @@ -6808,6 +6818,7 @@ dependencies = [ "chrono", "clap", "clashmap", + "compute_api", "control_plane", "cron", "diesel", @@ -7641,7 +7652,7 @@ dependencies = [ "prettyplease", "proc-macro2", "prost-build 0.13.3", - "prost-types 0.13.3", + "prost-types 0.13.5", "quote", "syn 2.0.100", ] @@ -7653,7 +7664,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9687bd5bfeafebdded2356950f278bba8226f0b32109537c4253406e09aafe1" dependencies = [ "prost 0.13.5", - "prost-types 0.13.3", + "prost-types 0.13.5", "tokio", "tokio-stream", "tonic 0.13.1", diff --git a/Cargo.toml b/Cargo.toml index aeb7976b6c..7728f6d8fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -152,6 +152,7 @@ pprof = { version = "0.14", features = ["criterion", "flamegraph", "frame-pointe procfs = "0.16" prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency prost = "0.13.5" +prost-types = "0.13.5" rand = "0.8" redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index bce2a28b8b..111e64d5d1 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1983,7 +1983,7 @@ RUN apt update && \ locales \ lsof \ procps \ - rsyslog \ + rsyslog-gnutls \ screen \ tcpdump \ $VERSION_INSTALLS && \ diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 0a071c1ad1..1a03022d89 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -27,6 +27,7 @@ fail.workspace = true flate2.workspace = true futures.workspace = true http.workspace = true +hostname-validator = "1.1" indexmap.workspace = true itertools.workspace = true jsonwebtoken.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index d7ff381f1b..db7746b8eb 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -36,6 +36,8 @@ use std::ffi::OsString; use std::fs::File; use std::process::exit; +use std::sync::Arc; +use std::sync::atomic::AtomicU64; use std::sync::mpsc; use std::thread; use std::time::Duration; @@ -190,7 +192,9 @@ fn main() -> Result<()> { cgroup: cli.cgroup, #[cfg(target_os = "linux")] vm_monitor_addr: cli.vm_monitor_addr, - installed_extensions_collection_interval: cli.installed_extensions_collection_interval, + installed_extensions_collection_interval: Arc::new(AtomicU64::new( + cli.installed_extensions_collection_interval, + )), }, config, )?; diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 7566626d57..fae76579d8 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,4 +1,4 @@ -use anyhow::{Context, Result, anyhow}; +use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; use 
compute_api::privilege::Privilege; use compute_api::responses::{ @@ -6,7 +6,7 @@ use compute_api::responses::{ LfcPrewarmState, TlsConfig, }; use compute_api::spec::{ - ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent, + ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent, }; use futures::StreamExt; use futures::future::join_all; @@ -25,7 +25,7 @@ use std::os::unix::fs::{PermissionsExt, symlink}; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; -use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::{Arc, Condvar, Mutex, RwLock}; use std::time::{Duration, Instant}; use std::{env, fs}; @@ -70,6 +70,7 @@ pub static BUILD_TAG: Lazy = Lazy::new(|| { .unwrap_or(BUILD_TAG_DEFAULT) .to_string() }); +const DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL: u64 = 3600; /// Static configuration params that don't change after startup. These mostly /// come from the CLI args, or are derived from them. @@ -103,7 +104,7 @@ pub struct ComputeNodeParams { pub remote_ext_base_url: Option, /// Interval for installed extensions collection - pub installed_extensions_collection_interval: u64, + pub installed_extensions_collection_interval: Arc, } /// Compute node info shared across several `compute_ctl` threads. @@ -126,6 +127,9 @@ pub struct ComputeNode { // key: ext_archive_name, value: started download time, download_completed? pub ext_download_progress: RwLock, bool)>>, pub compute_ctl_config: ComputeCtlConfig, + + /// Handle to the extension stats collection task + extension_stats_task: Mutex>>, } // store some metrics about download size that might impact startup time @@ -428,6 +432,7 @@ impl ComputeNode { state_changed: Condvar::new(), ext_download_progress: RwLock::new(HashMap::new()), compute_ctl_config: config.compute_ctl_config, + extension_stats_task: Mutex::new(None), }) } @@ -515,6 +520,9 @@ impl ComputeNode { None }; + // Terminate the extension stats collection task + this.terminate_extension_stats_task(); + // Terminate the vm_monitor so it releases the file watcher on // /sys/fs/cgroup/neon-postgres. // Note: the vm-monitor only runs on linux because it requires cgroups. @@ -751,10 +759,15 @@ impl ComputeNode { // Configure and start rsyslog for compliance audit logging match pspec.spec.audit_log_level { ComputeAudit::Hipaa | ComputeAudit::Extended | ComputeAudit::Full => { - let remote_endpoint = + let remote_tls_endpoint = + std::env::var("AUDIT_LOGGING_TLS_ENDPOINT").unwrap_or("".to_string()); + let remote_plain_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or("".to_string()); - if remote_endpoint.is_empty() { - anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty"); + + if remote_plain_endpoint.is_empty() && remote_tls_endpoint.is_empty() { + anyhow::bail!( + "AUDIT_LOGGING_ENDPOINT and AUDIT_LOGGING_TLS_ENDPOINT are both empty" + ); } let log_directory_path = Path::new(&self.params.pgdata).join("log"); @@ -770,7 +783,8 @@ impl ComputeNode { log_directory_path.clone(), endpoint_id, project_id, - &remote_endpoint, + &remote_plain_endpoint, + &remote_tls_endpoint, )?; // Launch a background task to clean up the audit logs @@ -1003,19 +1017,12 @@ impl ComputeNode { fn try_get_basebackup(&self, compute_state: &ComputeState, lsn: Lsn) -> Result<()> { let spec = compute_state.pspec.as_ref().expect("spec must be set"); - // Detect the protocol scheme. If the URL doesn't have a scheme, assume libpq. 
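Note (illustrative sketch, not part of the patch): the hunk that continues below drops the ad-hoc URL-scheme sniffing in try_get_basebackup and instead dispatches on PageserverProtocol::from_connstring, which this diff adds to libs/compute_api/src/spec.rs. A minimal standalone example of the resulting dispatch; the describe_protocol helper name is hypothetical.

```rust
use compute_api::spec::PageserverProtocol;

// Hypothetical helper showing the dispatch that try_get_basebackup uses below.
fn describe_protocol(connstring: &str) -> anyhow::Result<&'static str> {
    Ok(match PageserverProtocol::from_connstring(connstring)? {
        // postgresql:// / postgres:// schemes, or a connstring with no URL
        // scheme at all, select the original libpq/COPY protocol.
        PageserverProtocol::Libpq => "libpq",
        // grpc:// selects the newer gRPC page service.
        PageserverProtocol::Grpc => "grpc",
    })
}
```

Any other scheme makes from_connstring return an error, so the explicit "unknown URL scheme" arm removed below is no longer needed.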
let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap(); - let scheme = match Url::parse(shard0_connstr) { - Ok(url) => url.scheme().to_lowercase().to_string(), - Err(url::ParseError::RelativeUrlWithoutBase) => "postgresql".to_string(), - Err(err) => return Err(anyhow!("invalid connstring URL: {err}")), - }; - let started = Instant::now(); - let (connected, size) = match scheme.as_str() { - "postgresql" | "postgres" => self.try_get_basebackup_libpq(spec, lsn)?, - "grpc" => self.try_get_basebackup_grpc(spec, lsn)?, - scheme => return Err(anyhow!("unknown URL scheme {scheme}")), + + let (connected, size) = match PageserverProtocol::from_connstring(shard0_connstr)? { + PageserverProtocol::Libpq => self.try_get_basebackup_libpq(spec, lsn)?, + PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?, }; let mut state = self.state.lock().unwrap(); @@ -1678,6 +1685,8 @@ impl ComputeNode { tls_config = self.compute_ctl_config.tls.clone(); } + self.update_installed_extensions_collection_interval(&spec); + let max_concurrent_connections = self.max_service_connections(compute_state, &spec); // Merge-apply spec & changes to PostgreSQL state. @@ -1742,6 +1751,8 @@ impl ComputeNode { let tls_config = self.tls_config(&spec); + self.update_installed_extensions_collection_interval(&spec); + if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings { info!("tuning pgbouncer"); @@ -2346,10 +2357,20 @@ LIMIT 100", } pub fn spawn_extension_stats_task(&self) { + // Cancel any existing task + if let Some(handle) = self.extension_stats_task.lock().unwrap().take() { + handle.abort(); + } + let conf = self.tokio_conn_conf.clone(); - let installed_extensions_collection_interval = - self.params.installed_extensions_collection_interval; - tokio::spawn(async move { + let atomic_interval = self.params.installed_extensions_collection_interval.clone(); + let mut installed_extensions_collection_interval = + 2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst); + info!( + "[NEON_EXT_SPAWN] Spawning background installed extensions worker with Timeout: {}", + installed_extensions_collection_interval + ); + let handle = tokio::spawn(async move { // An initial sleep is added to ensure that two collections don't happen at the same time. // The first collection happens during compute startup. tokio::time::sleep(tokio::time::Duration::from_secs( @@ -2362,8 +2383,48 @@ LIMIT 100", loop { interval.tick().await; let _ = installed_extensions(conf.clone()).await; + // Re-read the shared interval value and recompute the timer period; with max() the period can only grow + interval = tokio::time::interval(tokio::time::Duration::from_secs(std::cmp::max( + installed_extensions_collection_interval, + 2 * atomic_interval.load(std::sync::atomic::Ordering::SeqCst), + ))); + installed_extensions_collection_interval = interval.period().as_secs(); } }); + + // Store the new task handle + *self.extension_stats_task.lock().unwrap() = Some(handle); + } + + fn terminate_extension_stats_task(&self) { + if let Some(handle) = self.extension_stats_task.lock().unwrap().take() { + handle.abort(); + } + } + + fn update_installed_extensions_collection_interval(&self, spec: &ComputeSpec) { + // Update the interval for collecting installed extensions statistics. + // If the value is -1, the endpoint never suspends, so fall back to the default collection interval. + // If the value is 0, the default was requested, so keep using the default collection interval.
+ if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 { + info!( + "[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}", + spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL + ); + self.params.installed_extensions_collection_interval.store( + DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL, + std::sync::atomic::Ordering::SeqCst, + ); + } else { + info!( + "[NEON_EXT_INT_UPD] Spec Timeout: {}", + spec.suspend_timeout_seconds + ); + self.params.installed_extensions_collection_interval.store( + spec.suspend_timeout_seconds as u64, + std::sync::atomic::Ordering::SeqCst, + ); + } } } diff --git a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf index 48b1a6f5c3..f072f652cf 100644 --- a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf +++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf @@ -10,7 +10,13 @@ input(type="imfile" File="{log_directory}/*.log" startmsg.regex="^[[:digit:]]{{4}}-[[:digit:]]{{2}}-[[:digit:]]{{2}} [[:digit:]]{{2}}:[[:digit:]]{{2}}:[[:digit:]]{{2}}.[[:digit:]]{{3}} GMT,") # the directory to store rsyslog state files -global(workDirectory="/var/log/rsyslog") +global( + workDirectory="/var/log/rsyslog" + DefaultNetstreamDriverCAFile="/etc/ssl/certs/ca-certificates.crt" +) + +# Whether the remote syslog receiver uses tls +set $.remote_syslog_tls = "{remote_syslog_tls}"; # Construct json, endpoint_id and project_id as additional metadata set $.json_log!endpoint_id = "{endpoint_id}"; @@ -21,5 +27,29 @@ set $.json_log!msg = $msg; template(name="PgAuditLog" type="string" string="<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% - - - - %$.json_log%") -# Forward to remote syslog receiver (@@:;format -local5.info @@{remote_endpoint};PgAuditLog +# Forward to remote syslog receiver (over TLS) +if ( $syslogtag == 'pgaudit_log' ) then {{ + if ( $.remote_syslog_tls == 'true' ) then {{ + action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp" + template="PgAuditLog" + queue.type="linkedList" + queue.size="1000" + action.ResumeRetryCount="10" + StreamDriver="gtls" + StreamDriverMode="1" + StreamDriverAuthMode="x509/name" + StreamDriverPermittedPeers="{remote_syslog_host}" + StreamDriver.CheckExtendedKeyPurpose="on" + StreamDriver.PermitExpiredCerts="off" + ) + stop + }} else {{ + action(type="omfwd" target="{remote_syslog_host}" port="{remote_syslog_port}" protocol="tcp" + template="PgAuditLog" + queue.type="linkedList" + queue.size="1000" + action.ResumeRetryCount="10" + ) + stop + }} +}} diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs index 930dbc52b4..3346c18c0d 100644 --- a/compute_tools/src/lsn_lease.rs +++ b/compute_tools/src/lsn_lease.rs @@ -4,7 +4,9 @@ use std::thread; use std::time::{Duration, SystemTime}; use anyhow::{Result, bail}; -use compute_api::spec::ComputeMode; +use compute_api::spec::{ComputeMode, PageserverProtocol}; +use itertools::Itertools as _; +use pageserver_page_api as page_api; use postgres::{NoTls, SimpleQueryMessage}; use tracing::{info, warn}; use utils::id::{TenantId, TimelineId}; @@ -76,25 +78,17 @@ fn acquire_lsn_lease_with_retry( loop { // Note: List of pageservers is dynamic, need to re-read configs before each attempt. 
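Note (illustrative sketch, not part of the patch): the compute.rs changes above derive the installed-extensions collection period from the spec's suspend_timeout_seconds. A self-contained sketch of that derivation, assuming the hypothetical helper names stored_interval and next_period; the constant and the 2x / never-shrink behaviour mirror update_installed_extensions_collection_interval() and the loop in spawn_extension_stats_task().

```rust
const DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL: u64 = 3600;

/// Mirrors update_installed_extensions_collection_interval(): -1 (never
/// suspend) and 0 (use the default) both fall back to the default interval.
fn stored_interval(suspend_timeout_seconds: i64) -> u64 {
    match suspend_timeout_seconds {
        -1 | 0 => DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL,
        secs => secs as u64,
    }
}

/// Mirrors the collection loop: the task ticks at twice the stored interval,
/// and the period never shrinks once it has grown.
fn next_period(previous_period: u64, suspend_timeout_seconds: i64) -> u64 {
    std::cmp::max(previous_period, 2 * stored_interval(suspend_timeout_seconds))
}

// Example: suspend_timeout_seconds = 300 gives a 600s collection period,
// while -1 or 0 gives 7200s (twice the 3600s default).
```

Because the stored value is refreshed on every configure/reconfigure, shortening the suspend timeout later does not shorten an already-running task's period; only respawning the task does.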
- let configs = { + let (connstrings, auth) = { let state = compute.state.lock().unwrap(); - let spec = state.pspec.as_ref().expect("spec must be set"); - - let conn_strings = spec.pageserver_connstr.split(','); - - conn_strings - .map(|connstr| { - let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr"); - if let Some(storage_auth_token) = &spec.storage_auth_token { - config.password(storage_auth_token.clone()); - } - config - }) - .collect::>() + ( + spec.pageserver_connstr.clone(), + spec.storage_auth_token.clone(), + ) }; - let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs); + let result = + try_acquire_lsn_lease(&connstrings, auth.as_deref(), tenant_id, timeline_id, lsn); match result { Ok(Some(res)) => { return Ok(res); @@ -116,68 +110,104 @@ fn acquire_lsn_lease_with_retry( } } -/// Tries to acquire an LSN lease through PS page_service API. +/// Tries to acquire LSN leases on all Pageserver shards. fn try_acquire_lsn_lease( + connstrings: &str, + auth: Option<&str>, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn, - configs: &[postgres::Config], ) -> Result> { - fn get_valid_until( - config: &postgres::Config, - tenant_shard_id: TenantShardId, - timeline_id: TimelineId, - lsn: Lsn, - ) -> Result> { - let mut client = config.connect(NoTls)?; - let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} "); - let res = client.simple_query(&cmd)?; - let msg = match res.first() { - Some(msg) => msg, - None => bail!("empty response"), - }; - let row = match msg { - SimpleQueryMessage::Row(row) => row, - _ => bail!("error parsing lsn lease response"), + let connstrings = connstrings.split(',').collect_vec(); + let shard_count = connstrings.len(); + let mut leases = Vec::new(); + + for (shard_number, &connstring) in connstrings.iter().enumerate() { + let tenant_shard_id = match shard_count { + 0 | 1 => TenantShardId::unsharded(tenant_id), + shard_count => TenantShardId { + tenant_id, + shard_number: ShardNumber(shard_number as u8), + shard_count: ShardCount::new(shard_count as u8), + }, }; - // Note: this will be None if a lease is explicitly not granted. - let valid_until_str = row.get("valid_until"); - - let valid_until = valid_until_str.map(|s| { - SystemTime::UNIX_EPOCH - .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64)) - .expect("Time larger than max SystemTime could handle") - }); - Ok(valid_until) + let lease = match PageserverProtocol::from_connstring(connstring)? { + PageserverProtocol::Libpq => { + acquire_lsn_lease_libpq(connstring, auth, tenant_shard_id, timeline_id, lsn)? + } + PageserverProtocol::Grpc => { + acquire_lsn_lease_grpc(connstring, auth, tenant_shard_id, timeline_id, lsn)? + } + }; + leases.push(lease); } - let shard_count = configs.len(); + Ok(leases.into_iter().min().flatten()) +} - let valid_until = if shard_count > 1 { - configs - .iter() - .enumerate() - .map(|(shard_number, config)| { - let tenant_shard_id = TenantShardId { - tenant_id, - shard_count: ShardCount::new(shard_count as u8), - shard_number: ShardNumber(shard_number as u8), - }; - get_valid_until(config, tenant_shard_id, timeline_id, lsn) - }) - .collect::>>>()? - .into_iter() - .min() - .unwrap() - } else { - get_valid_until( - &configs[0], - TenantShardId::unsharded(tenant_id), - timeline_id, - lsn, - )? +/// Acquires an LSN lease on a single shard, using the libpq API. The connstring must use a +/// postgresql:// scheme. 
+fn acquire_lsn_lease_libpq( + connstring: &str, + auth: Option<&str>, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, +) -> Result> { + let mut config = postgres::Config::from_str(connstring)?; + if let Some(auth) = auth { + config.password(auth); + } + let mut client = config.connect(NoTls)?; + let cmd = format!("lease lsn {tenant_shard_id} {timeline_id} {lsn} "); + let res = client.simple_query(&cmd)?; + let msg = match res.first() { + Some(msg) => msg, + None => bail!("empty response"), + }; + let row = match msg { + SimpleQueryMessage::Row(row) => row, + _ => bail!("error parsing lsn lease response"), }; + // Note: this will be None if a lease is explicitly not granted. + let valid_until_str = row.get("valid_until"); + + let valid_until = valid_until_str.map(|s| { + SystemTime::UNIX_EPOCH + .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64)) + .expect("Time larger than max SystemTime could handle") + }); Ok(valid_until) } + +/// Acquires an LSN lease on a single shard, using the gRPC API. The connstring must use a +/// grpc:// scheme. +fn acquire_lsn_lease_grpc( + connstring: &str, + auth: Option<&str>, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, +) -> Result> { + tokio::runtime::Handle::current().block_on(async move { + let mut client = page_api::Client::new( + connstring.to_string(), + tenant_shard_id.tenant_id, + timeline_id, + tenant_shard_id.to_index(), + auth.map(String::from), + None, + ) + .await?; + + let req = page_api::LeaseLsnRequest { lsn }; + match client.lease_lsn(req).await { + Ok(expires) => Ok(Some(expires)), + // Lease couldn't be acquired because the LSN has been garbage collected. + Err(err) if err.code() == tonic::Code::FailedPrecondition => Ok(None), + Err(err) => Err(err.into()), + } + }) +} diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs index 3bc2e72b19..3ced0a5654 100644 --- a/compute_tools/src/rsyslog.rs +++ b/compute_tools/src/rsyslog.rs @@ -4,8 +4,10 @@ use std::path::Path; use std::process::Command; use std::time::Duration; use std::{fs::OpenOptions, io::Write}; +use url::{Host, Url}; use anyhow::{Context, Result, anyhow}; +use hostname_validator; use tracing::{error, info, instrument, warn}; const POSTGRES_LOGS_CONF_PATH: &str = "/etc/rsyslog.d/postgres_logs.conf"; @@ -82,18 +84,84 @@ fn restart_rsyslog() -> Result<()> { Ok(()) } +fn parse_audit_syslog_address( + remote_plain_endpoint: &str, + remote_tls_endpoint: &str, +) -> Result<(String, u16, String)> { + let tls; + let remote_endpoint = if !remote_tls_endpoint.is_empty() { + tls = "true".to_string(); + remote_tls_endpoint + } else { + tls = "false".to_string(); + remote_plain_endpoint + }; + // Urlify the remote_endpoint, so parsing can be done with url::Url. 
+ let url_str = format!("http://{remote_endpoint}"); + let url = Url::parse(&url_str).map_err(|err| { + anyhow!("Error parsing {remote_endpoint}, expected host:port, got {err:?}") + })?; + + let is_valid = url.scheme() == "http" + && url.path() == "/" + && url.query().is_none() + && url.fragment().is_none() + && url.username() == "" + && url.password().is_none(); + + if !is_valid { + return Err(anyhow!( + "Invalid address format {remote_endpoint}, expected host:port" + )); + } + let host = match url.host() { + Some(Host::Domain(h)) if hostname_validator::is_valid(h) => h.to_string(), + Some(Host::Ipv4(ip4)) => ip4.to_string(), + Some(Host::Ipv6(ip6)) => ip6.to_string(), + _ => return Err(anyhow!("Invalid host")), + }; + let port = url + .port() + .ok_or_else(|| anyhow!("Invalid port in {remote_endpoint}"))?; + + Ok((host, port, tls)) +} + +fn generate_audit_rsyslog_config( + log_directory: String, + endpoint_id: &str, + project_id: &str, + remote_syslog_host: &str, + remote_syslog_port: u16, + remote_syslog_tls: &str, +) -> String { + format!( + include_str!("config_template/compute_audit_rsyslog_template.conf"), + log_directory = log_directory, + endpoint_id = endpoint_id, + project_id = project_id, + remote_syslog_host = remote_syslog_host, + remote_syslog_port = remote_syslog_port, + remote_syslog_tls = remote_syslog_tls + ) +} + pub fn configure_audit_rsyslog( log_directory: String, endpoint_id: &str, project_id: &str, remote_endpoint: &str, + remote_tls_endpoint: &str, ) -> Result<()> { - let config_content: String = format!( - include_str!("config_template/compute_audit_rsyslog_template.conf"), - log_directory = log_directory, - endpoint_id = endpoint_id, - project_id = project_id, - remote_endpoint = remote_endpoint + let (remote_syslog_host, remote_syslog_port, remote_syslog_tls) = + parse_audit_syslog_address(remote_endpoint, remote_tls_endpoint).unwrap(); + let config_content = generate_audit_rsyslog_config( + log_directory, + endpoint_id, + project_id, + &remote_syslog_host, + remote_syslog_port, + &remote_syslog_tls, ); info!("rsyslog config_content: {}", config_content); @@ -258,6 +326,8 @@ pub fn launch_pgaudit_gc(log_directory: String) { mod tests { use crate::rsyslog::PostgresLogsRsyslogConfig; + use super::{generate_audit_rsyslog_config, parse_audit_syslog_address}; + #[test] fn test_postgres_logs_config() { { @@ -287,4 +357,146 @@ mod tests { assert!(res.is_err()); } } + + #[test] + fn test_parse_audit_syslog_address() { + { + // host:port format (plaintext) + let parsed = parse_audit_syslog_address("collector.host.tld:5555", ""); + assert!(parsed.is_ok()); + assert_eq!( + parsed.unwrap(), + ( + String::from("collector.host.tld"), + 5555, + String::from("false") + ) + ); + } + + { + // host:port format with ipv4 ip address (plaintext) + let parsed = parse_audit_syslog_address("10.0.0.1:5555", ""); + assert!(parsed.is_ok()); + assert_eq!( + parsed.unwrap(), + (String::from("10.0.0.1"), 5555, String::from("false")) + ); + } + + { + // host:port format with ipv6 ip address (plaintext) + let parsed = + parse_audit_syslog_address("[7e60:82ed:cb2e:d617:f904:f395:aaca:e252]:5555", ""); + assert_eq!( + parsed.unwrap(), + ( + String::from("7e60:82ed:cb2e:d617:f904:f395:aaca:e252"), + 5555, + String::from("false") + ) + ); + } + + { + // Only TLS host:port defined + let parsed = parse_audit_syslog_address("", "tls.host.tld:5556"); + assert_eq!( + parsed.unwrap(), + (String::from("tls.host.tld"), 5556, String::from("true")) + ); + } + + { + // tls host should take precedence, when 
both defined + let parsed = parse_audit_syslog_address("plaintext.host.tld:5555", "tls.host.tld:5556"); + assert_eq!( + parsed.unwrap(), + (String::from("tls.host.tld"), 5556, String::from("true")) + ); + } + + { + // host without port (plaintext) + let parsed = parse_audit_syslog_address("collector.host.tld", ""); + assert!(parsed.is_err()); + } + + { + // port without host + let parsed = parse_audit_syslog_address(":5555", ""); + assert!(parsed.is_err()); + } + + { + // valid host with invalid port + let parsed = parse_audit_syslog_address("collector.host.tld:90001", ""); + assert!(parsed.is_err()); + } + + { + // invalid hostname with valid port + let parsed = parse_audit_syslog_address("-collector.host.tld:5555", ""); + assert!(parsed.is_err()); + } + + { + // parse error + let parsed = parse_audit_syslog_address("collector.host.tld:::5555", ""); + assert!(parsed.is_err()); + } + } + + #[test] + fn test_generate_audit_rsyslog_config() { + { + // plaintext version + let log_directory = "/tmp/log".to_string(); + let endpoint_id = "ep-test-endpoint-id"; + let project_id = "test-project-id"; + let remote_syslog_host = "collector.host.tld"; + let remote_syslog_port = 5555; + let remote_syslog_tls = "false"; + + let conf_str = generate_audit_rsyslog_config( + log_directory, + endpoint_id, + project_id, + remote_syslog_host, + remote_syslog_port, + remote_syslog_tls, + ); + + assert!(conf_str.contains(r#"set $.remote_syslog_tls = "false";"#)); + assert!(conf_str.contains(r#"type="omfwd""#)); + assert!(conf_str.contains(r#"target="collector.host.tld""#)); + assert!(conf_str.contains(r#"port="5555""#)); + assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#)); + } + + { + // TLS version + let log_directory = "/tmp/log".to_string(); + let endpoint_id = "ep-test-endpoint-id"; + let project_id = "test-project-id"; + let remote_syslog_host = "collector.host.tld"; + let remote_syslog_port = 5556; + let remote_syslog_tls = "true"; + + let conf_str = generate_audit_rsyslog_config( + log_directory, + endpoint_id, + project_id, + remote_syslog_host, + remote_syslog_port, + remote_syslog_tls, + ); + + assert!(conf_str.contains(r#"set $.remote_syslog_tls = "true";"#)); + assert!(conf_str.contains(r#"type="omfwd""#)); + assert!(conf_str.contains(r#"target="collector.host.tld""#)); + assert!(conf_str.contains(r#"port="5556""#)); + assert!(conf_str.contains(r#"StreamDriverPermittedPeers="collector.host.tld""#)); + } + } } diff --git a/compute_tools/tests/cluster_spec.json b/compute_tools/tests/cluster_spec.json index 5655a94de4..439d80c057 100644 --- a/compute_tools/tests/cluster_spec.json +++ b/compute_tools/tests/cluster_spec.json @@ -3,7 +3,8 @@ "timestamp": "2021-05-23T18:25:43.511Z", "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", - + "suspend_timeout_seconds": 3600, + "cluster": { "cluster_id": "test-cluster-42", "name": "Zenith Test", diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index c818d07fef..3440d8979a 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -16,9 +16,9 @@ use std::time::Duration; use anyhow::{Context, Result, anyhow, bail}; use clap::Parser; use compute_api::requests::ComputeClaimsScope; -use compute_api::spec::ComputeMode; +use compute_api::spec::{ComputeMode, PageserverProtocol}; use control_plane::broker::StorageBroker; -use control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode, PageserverProtocol}; +use 
control_plane::endpoint::{ComputeControlPlane, EndpointTerminateMode}; use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_ADDR, EndpointStorage}; use control_plane::local_env; use control_plane::local_env::{ @@ -1649,7 +1649,9 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. let safekeepers = parse_safekeepers(&args.safekeepers)?; - endpoint.reconfigure(pageservers, None, safekeepers).await?; + endpoint + .reconfigure(Some(pageservers), None, safekeepers, None) + .await?; } EndpointCmd::Stop(args) => { let endpoint_id = &args.endpoint_id; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index e3faa082db..424101b9a4 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -56,8 +56,8 @@ use compute_api::responses::{ TlsConfig, }; use compute_api::spec::{ - Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, - RemoteExtSpec, Role, + Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PageserverProtocol, + PgIdent, RemoteExtSpec, Role, }; use jsonwebtoken::jwk::{ AlgorithmParameters, CommonParameters, EllipticCurve, Jwk, JwkSet, KeyAlgorithm, KeyOperations, @@ -373,29 +373,6 @@ impl std::fmt::Display for EndpointTerminateMode { } } -/// Protocol used to connect to a Pageserver. -#[derive(Clone, Copy, Debug)] -pub enum PageserverProtocol { - Libpq, - Grpc, -} - -impl PageserverProtocol { - /// Returns the URL scheme for the protocol, used in connstrings. - pub fn scheme(&self) -> &'static str { - match self { - Self::Libpq => "postgresql", - Self::Grpc => "grpc", - } - } -} - -impl Display for PageserverProtocol { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(self.scheme()) - } -} - impl Endpoint { fn from_dir_entry(entry: std::fs::DirEntry, env: &LocalEnv) -> Result { if !entry.file_type()?.is_dir() { @@ -803,6 +780,7 @@ impl Endpoint { endpoint_storage_addr: Some(endpoint_storage_addr), endpoint_storage_token: Some(endpoint_storage_token), autoprewarm: false, + suspend_timeout_seconds: -1, // Only used in neon_local. }; // this strange code is needed to support respec() in tests @@ -997,12 +975,11 @@ impl Endpoint { pub async fn reconfigure( &self, - pageservers: Vec<(PageserverProtocol, Host, u16)>, + pageservers: Option>, stripe_size: Option, safekeepers: Option>, + safekeeper_generation: Option, ) -> Result<()> { - anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided"); - let (mut spec, compute_ctl_config) = { let config_path = self.endpoint_path().join("config.json"); let file = std::fs::File::open(config_path)?; @@ -1014,16 +991,24 @@ impl Endpoint { let postgresql_conf = self.read_postgresql_conf()?; spec.cluster.postgresql_conf = Some(postgresql_conf); - let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); - spec.pageserver_connstring = Some(pageserver_connstr); - if stripe_size.is_some() { - spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); + // If pageservers are not specified, don't change them. 
+ if let Some(pageservers) = pageservers { + anyhow::ensure!(!pageservers.is_empty(), "no pageservers provided"); + + let pageserver_connstr = Self::build_pageserver_connstr(&pageservers); + spec.pageserver_connstring = Some(pageserver_connstr); + if stripe_size.is_some() { + spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); + } } // If safekeepers are not specified, don't change them. if let Some(safekeepers) = safekeepers { let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; spec.safekeeper_connstrings = safekeeper_connstrings; + if let Some(g) = safekeeper_generation { + spec.safekeepers_generation = Some(g.into_inner()); + } } let client = reqwest::Client::builder() @@ -1061,6 +1046,24 @@ impl Endpoint { } } + pub async fn reconfigure_pageservers( + &self, + pageservers: Vec<(PageserverProtocol, Host, u16)>, + stripe_size: Option, + ) -> Result<()> { + self.reconfigure(Some(pageservers), stripe_size, None, None) + .await + } + + pub async fn reconfigure_safekeepers( + &self, + safekeepers: Vec, + generation: SafekeeperGeneration, + ) -> Result<()> { + self.reconfigure(None, None, Some(safekeepers), Some(generation)) + .await + } + pub async fn stop( &self, mode: EndpointTerminateMode, diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 16cd2d8c08..d0611113e8 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -212,7 +212,7 @@ pub struct NeonStorageControllerConf { pub use_local_compute_notifications: bool, - pub timeline_safekeeper_count: Option, + pub timeline_safekeeper_count: Option, pub posthog_config: Option, diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index dea7ae2ccf..bb83a6319c 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -638,7 +638,13 @@ impl StorageController { args.push("--timelines-onto-safekeepers".to_string()); } - if let Some(sk_cnt) = self.config.timeline_safekeeper_count { + // neon_local is used in test environments where we often have less than 3 safekeepers. + if self.config.timeline_safekeeper_count.is_some() || self.env.safekeepers.len() < 3 { + let sk_cnt = self + .config + .timeline_safekeeper_count + .unwrap_or(self.env.safekeepers.len()); + args.push(format!("--timeline-safekeeper-count={sk_cnt}")); } diff --git a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json index 21caf3800c..60e232425b 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json +++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json @@ -4,6 +4,7 @@ "timestamp": "2022-10-12T18:00:00.000Z", "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", + "suspend_timeout_seconds": -1, "cluster": { "cluster_id": "docker_compose", diff --git a/libs/compute_api/Cargo.toml b/libs/compute_api/Cargo.toml index 81b0cd19a1..83cb639f0a 100644 --- a/libs/compute_api/Cargo.toml +++ b/libs/compute_api/Cargo.toml @@ -12,6 +12,7 @@ jsonwebtoken.workspace = true serde.workspace = true serde_json.workspace = true regex.workspace = true +url.workspace = true utils = { path = "../utils" } remote_storage = { version = "0.1", path = "../remote_storage/" } diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 0e23b70265..6b2caa9d3a 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -4,11 +4,14 @@ //! 
provide it by calling the compute_ctl's `/compute_ctl` endpoint, or //! compute_ctl can fetch it by calling the control plane's API. use std::collections::HashMap; +use std::fmt::Display; +use anyhow::anyhow; use indexmap::IndexMap; use regex::Regex; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; +use url::Url; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -181,6 +184,11 @@ pub struct ComputeSpec { /// Download LFC state from endpoint_storage and pass it to Postgres on startup #[serde(default)] pub autoprewarm: bool, + + /// Suspend timeout in seconds. + /// + /// We use this value to derive other values, such as the installed extensions metric. + pub suspend_timeout_seconds: i64, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -429,6 +437,47 @@ pub struct JwksSettings { pub jwt_audience: Option, } +/// Protocol used to connect to a Pageserver. Parsed from the connstring scheme. +#[derive(Clone, Copy, Debug, Default)] +pub enum PageserverProtocol { + /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme. + #[default] + Libpq, + /// A newer, gRPC-based protocol. Uses grpc:// scheme. + Grpc, +} + +impl PageserverProtocol { + /// Parses the protocol from a connstring scheme. Defaults to Libpq if no scheme is given. + /// Errors if the connstring is an invalid URL. + pub fn from_connstring(connstring: &str) -> anyhow::Result { + let scheme = match Url::parse(connstring) { + Ok(url) => url.scheme().to_lowercase(), + Err(url::ParseError::RelativeUrlWithoutBase) => return Ok(Self::default()), + Err(err) => return Err(anyhow!("invalid connstring URL: {err}")), + }; + match scheme.as_str() { + "postgresql" | "postgres" => Ok(Self::Libpq), + "grpc" => Ok(Self::Grpc), + scheme => Err(anyhow!("invalid protocol scheme: {scheme}")), + } + } + + /// Returns the URL scheme for the protocol, for use in connstrings. 
+ pub fn scheme(&self) -> &'static str { + match self { + Self::Libpq => "postgresql", + Self::Grpc => "grpc", + } + } +} + +impl Display for PageserverProtocol { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.scheme()) + } +} + #[cfg(test)] mod tests { use std::fs::File; diff --git a/libs/compute_api/tests/cluster_spec.json b/libs/compute_api/tests/cluster_spec.json index 2dd2aae015..94d7f1e081 100644 --- a/libs/compute_api/tests/cluster_spec.json +++ b/libs/compute_api/tests/cluster_spec.json @@ -3,6 +3,7 @@ "timestamp": "2021-05-23T18:25:43.511Z", "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8b", + "suspend_timeout_seconds": 3600, "cluster": { "cluster_id": "test-cluster-42", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 6dc17b670b..7accbdabca 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -30,12 +30,13 @@ humantime-serde.workspace = true chrono = { workspace = true, features = ["serde"] } itertools.workspace = true storage_broker.workspace = true -camino = {workspace = true, features = ["serde1"]} +camino = { workspace = true, features = ["serde1"] } remote_storage.workspace = true postgres_backend.workspace = true -nix = {workspace = true, optional = true} +nix = { workspace = true, optional = true } reqwest.workspace = true rand.workspace = true +tracing.workspace = true tracing-utils.workspace = true once_cell.workspace = true diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index ff18d40bfe..a8080a57e9 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -546,6 +546,11 @@ pub struct TimelineImportRequest { pub sk_set: Vec, } +#[derive(serde::Serialize, serde::Deserialize, Clone)] +pub struct TimelineSafekeeperMigrateRequest { + pub new_sk_set: Vec, +} + #[cfg(test)] mod test { use serde_json; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 82a3ac0eb4..5f36e8d2b7 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -21,7 +21,9 @@ use utils::{completion, serde_system_time}; use crate::config::Ratio; use crate::key::{CompactKey, Key}; -use crate::shard::{DEFAULT_STRIPE_SIZE, ShardCount, ShardStripeSize, TenantShardId}; +use crate::shard::{ + DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardStripeSize, TenantShardId, +}; /// The state of a tenant in this pageserver. /// @@ -475,7 +477,7 @@ pub struct TenantShardSplitResponse { } /// Parameters that apply to all shards in a tenant. Used during tenant creation. 
-#[derive(Serialize, Deserialize, Debug)] +#[derive(Clone, Copy, Serialize, Deserialize, Debug)] #[serde(deny_unknown_fields)] pub struct ShardParameters { pub count: ShardCount, @@ -497,6 +499,15 @@ impl Default for ShardParameters { } } +impl From for ShardParameters { + fn from(identity: ShardIdentity) -> Self { + Self { + count: identity.count, + stripe_size: identity.stripe_size, + } + } +} + #[derive(Debug, Default, Clone, Eq, PartialEq)] pub enum FieldPatch { Upsert(T), @@ -1724,6 +1735,7 @@ pub enum DownloadRemoteLayersTaskState { #[derive(Debug, Serialize, Deserialize)] pub struct TimelineGcRequest { pub gc_horizon: Option, + pub ignore_lease_deadline: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 9c16be93e8..5a13aace64 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -37,6 +37,7 @@ use std::hash::{Hash, Hasher}; pub use ::utils::shard::*; use postgres_ffi_types::forknum::INIT_FORKNUM; use serde::{Deserialize, Serialize}; +use utils::critical; use crate::key::Key; use crate::models::ShardParameters; @@ -179,7 +180,7 @@ impl ShardIdentity { /// For use when creating ShardIdentity instances for new shards, where a creation request /// specifies the ShardParameters that apply to all shards. - pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self { + pub fn from_params(number: ShardNumber, params: ShardParameters) -> Self { Self { number, count: params.count, @@ -188,6 +189,17 @@ impl ShardIdentity { } } + /// Asserts that the given shard identities are equal. Changes to shard parameters will likely + /// result in data corruption. + pub fn assert_equal(&self, other: ShardIdentity) { + if self != &other { + // TODO: for now, we're conservative and just log errors in production. Turn this into a + // real assertion when we're confident it doesn't misfire, and also reject requests that + // attempt to change it with an error response. + critical!("shard identity mismatch: {self:?} != {other:?}"); + } + } + fn is_broken(&self) -> bool { self.layout == LAYOUT_BROKEN } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 0c5a5c945f..7042e42e66 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -173,7 +173,7 @@ pub struct TimelineStatus { } /// Request to switch membership configuration. 
-#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] #[serde(transparent)] pub struct TimelineMembershipSwitchRequest { pub mconf: Configuration, @@ -184,6 +184,8 @@ pub struct TimelineMembershipSwitchRequest { pub struct TimelineMembershipSwitchResponse { pub previous_conf: Configuration, pub current_conf: Configuration, + pub term: Term, + pub flush_lsn: Lsn, } #[derive(Clone, Copy, Serialize, Deserialize)] diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml index 42ee9b50e9..fbad8cf9d0 100644 --- a/pageserver/page_api/Cargo.toml +++ b/pageserver/page_api/Cargo.toml @@ -11,6 +11,7 @@ futures.workspace = true pageserver_api.workspace = true postgres_ffi_types.workspace = true prost.workspace = true +prost-types.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto index d06b2cfca5..1d6c230916 100644 --- a/pageserver/page_api/proto/page_service.proto +++ b/pageserver/page_api/proto/page_service.proto @@ -35,6 +35,8 @@ syntax = "proto3"; package page_api; +import "google/protobuf/timestamp.proto"; + service PageService { // Returns whether a relation exists. rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse); @@ -64,6 +66,10 @@ service PageService { // Fetches an SLRU segment. rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse); + + // Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage + // collect the LSN until the lease expires. Must be acquired on all relevant shards. + rpc LeaseLsn (LeaseLsnRequest) returns (LeaseLsnResponse); } // The LSN a request should read at. @@ -252,3 +258,17 @@ message GetSlruSegmentRequest { message GetSlruSegmentResponse { bytes segment = 1; } + +// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage +// collect the LSN until the lease expires. Must be acquired on all relevant shards. +message LeaseLsnRequest { + // The LSN to lease. Can't be 0 or below the current GC cutoff. + uint64 lsn = 1; +} + +// Lease acquisition response. If the lease could not be granted because the LSN has already been +// garbage collected, a FailedPrecondition status will be returned instead. +message LeaseLsnResponse { + // The lease expiration time. + google.protobuf.Timestamp expires = 1; +} diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs index 4b456787d2..65e41540b8 100644 --- a/pageserver/page_api/src/client.rs +++ b/pageserver/page_api/src/client.rs @@ -187,4 +187,17 @@ impl Client { let response = self.client.get_slru_segment(proto_req).await?; Ok(response.into_inner().try_into()?) } + + /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't + /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards. + /// + /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be + /// acquired because the LSN has already been garbage collected. + pub async fn lease_lsn( + &mut self, + req: model::LeaseLsnRequest, + ) -> Result { + let req = proto::LeaseLsnRequest::from(req); + Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?) 
+ } } diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs index 0493f79781..4497fc6fc7 100644 --- a/pageserver/page_api/src/model.rs +++ b/pageserver/page_api/src/model.rs @@ -16,6 +16,7 @@ //! stream combinators without dealing with errors, and avoids validating the same message twice. use std::fmt::Display; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; use bytes::Bytes; use postgres_ffi_types::Oid; @@ -703,3 +704,54 @@ impl From for proto::GetSlruSegmentResponse { // SlruKind is defined in pageserver_api::reltag. pub type SlruKind = pageserver_api::reltag::SlruKind; + +/// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't garbage +/// collect the LSN until the lease expires. +pub struct LeaseLsnRequest { + /// The LSN to lease. + pub lsn: Lsn, +} + +impl TryFrom for LeaseLsnRequest { + type Error = ProtocolError; + + fn try_from(pb: proto::LeaseLsnRequest) -> Result { + if pb.lsn == 0 { + return Err(ProtocolError::Missing("lsn")); + } + Ok(Self { lsn: Lsn(pb.lsn) }) + } +} + +impl From for proto::LeaseLsnRequest { + fn from(request: LeaseLsnRequest) -> Self { + Self { lsn: request.lsn.0 } + } +} + +/// Lease expiration time. If the lease could not be granted because the LSN has already been +/// garbage collected, a FailedPrecondition status will be returned instead. +pub type LeaseLsnResponse = SystemTime; + +impl TryFrom for LeaseLsnResponse { + type Error = ProtocolError; + + fn try_from(pb: proto::LeaseLsnResponse) -> Result { + let expires = pb.expires.ok_or(ProtocolError::Missing("expires"))?; + UNIX_EPOCH + .checked_add(Duration::new(expires.seconds as u64, expires.nanos as u32)) + .ok_or_else(|| ProtocolError::invalid("expires", expires)) + } +} + +impl From for proto::LeaseLsnResponse { + fn from(response: LeaseLsnResponse) -> Self { + let expires = response.duration_since(UNIX_EPOCH).unwrap_or_default(); + Self { + expires: Some(prost_types::Timestamp { + seconds: expires.as_secs() as i64, + nanos: expires.subsec_nanos() as i32, + }), + } + } +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index aa9bec657c..119275f885 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1893,9 +1893,13 @@ async fn update_tenant_config_handler( let location_conf = LocationConf::attached_single( new_tenant_conf.clone(), tenant.get_generation(), - &ShardParameters::default(), + ShardParameters::from(tenant.get_shard_identity()), ); + tenant + .get_shard_identity() + .assert_equal(location_conf.shard); // not strictly necessary since we construct it above + crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; @@ -1937,9 +1941,13 @@ async fn patch_tenant_config_handler( let location_conf = LocationConf::attached_single( updated, tenant.get_generation(), - &ShardParameters::default(), + ShardParameters::from(tenant.get_shard_identity()), ); + tenant + .get_shard_identity() + .assert_equal(location_conf.shard); // not strictly necessary since we construct it above + crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index c04f6e2b47..1d824ac846 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -14,6 +14,7 @@ use std::{io, str}; use 
anyhow::{Context as _, bail}; use bytes::{Buf as _, BufMut as _, BytesMut}; +use chrono::Utc; use futures::future::BoxFuture; use futures::{FutureExt, Stream}; use itertools::Itertools; @@ -3760,6 +3761,36 @@ impl proto::PageService for GrpcPageServiceHandler { let resp: page_api::GetSlruSegmentResponse = resp.segment; Ok(tonic::Response::new(resp.into())) } + + #[instrument(skip_all, fields(lsn))] + async fn lease_lsn( + &self, + req: tonic::Request, + ) -> Result, tonic::Status> { + let timeline = self.get_request_timeline(&req).await?; + let ctx = self.ctx.with_scope_timeline(&timeline); + + // Validate and convert the request, and decorate the span. + let req: page_api::LeaseLsnRequest = req.into_inner().try_into()?; + + span_record!(lsn=%req.lsn); + + // Attempt to acquire a lease. Return FailedPrecondition if the lease could not be granted. + let lease_length = timeline.get_lsn_lease_length(); + let expires = match timeline.renew_lsn_lease(req.lsn, lease_length, &ctx) { + Ok(lease) => lease.valid_until, + Err(err) => return Err(tonic::Status::failed_precondition(format!("{err}"))), + }; + + // TODO: is this spammy? Move it compute-side? + info!( + "acquired lease for {} until {}", + req.lsn, + chrono::DateTime::::from(expires).to_rfc3339() + ); + + Ok(tonic::Response::new(expires.into())) + } } /// gRPC middleware layer that handles observability concerns: diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 09a7a8a651..31f38d485f 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -3015,7 +3015,7 @@ mod tests { // This shard will get the even blocks let shard = ShardIdentity::from_params( ShardNumber(0), - &ShardParameters { + ShardParameters { count: ShardCount(2), stripe_size: ShardStripeSize(1), }, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2e9dbdc539..39d0fcd852 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -34,7 +34,7 @@ use once_cell::sync::Lazy; pub use pageserver_api::models::TenantState; use pageserver_api::models::{self, RelSizeMigration}; use pageserver_api::models::{ - CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem, + CompactInfoResponse, TimelineArchivalState, TimelineState, TopTenantShardItem, WalRedoManagerStatus, }; use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId}; @@ -180,6 +180,7 @@ pub(super) struct AttachedTenantConf { impl AttachedTenantConf { fn new( + conf: &'static PageServerConf, tenant_conf: pageserver_api::models::TenantConfig, location: AttachedLocationConfig, ) -> Self { @@ -191,9 +192,7 @@ impl AttachedTenantConf { let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single { Some( tokio::time::Instant::now() - + tenant_conf - .lsn_lease_length - .unwrap_or(LsnLease::DEFAULT_LENGTH), + + TenantShard::get_lsn_lease_length_impl(conf, &tenant_conf), ) } else { // We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale @@ -208,10 +207,13 @@ impl AttachedTenantConf { } } - fn try_from(location_conf: LocationConf) -> anyhow::Result { + fn try_from( + conf: &'static PageServerConf, + location_conf: LocationConf, + ) -> anyhow::Result { match &location_conf.mode { LocationMode::Attached(attach_conf) => { - Ok(Self::new(location_conf.tenant_conf, *attach_conf)) + Ok(Self::new(conf, location_conf.tenant_conf, *attach_conf)) } LocationMode::Secondary(_) => { anyhow::bail!( @@ -1145,6 +1147,22 @@ pub(crate) enum 
LoadConfigError { NotFound(Utf8PathBuf), } +pub(crate) enum IgnoreLeaseDeadline { + Default, + No, + Yes, +} + +impl From> for IgnoreLeaseDeadline { + fn from(value: Option) -> Self { + match value { + None => IgnoreLeaseDeadline::Default, + Some(false) => IgnoreLeaseDeadline::No, + Some(true) => IgnoreLeaseDeadline::Yes, + } + } +} + impl TenantShard { /// Yet another helper for timeline initialization. /// @@ -3076,6 +3094,7 @@ impl TenantShard { target_timeline_id: Option, horizon: u64, pitr: Duration, + ignore_lease_deadline: IgnoreLeaseDeadline, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -3101,7 +3120,23 @@ impl TenantShard { return Ok(GcResult::default()); } - if conf.is_gc_blocked_by_lsn_lease_deadline() { + // Skip GC if we're within lease deadline. + // + // Rust & Python tests set single-digit second gc_period and/or + // do immediate gc via mgmt API. The lease deadline is hard-coded to + // 10min, which would make most if not all tests skip GC here. + // Rust tests could in theory tokio::time::advance, but Python + // tests have no such options. + // Since lease deadline is a crutch we hopefully eventually replace + // with durable leases, take a shortcut here and skip lease deadline check + // for all tests. + // Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329 + let ignore_lease_deadline = match ignore_lease_deadline { + IgnoreLeaseDeadline::Default => cfg!(test) || cfg!(feature = "testing"), + IgnoreLeaseDeadline::No => false, + IgnoreLeaseDeadline::Yes => true, + }; + if !ignore_lease_deadline && conf.is_gc_blocked_by_lsn_lease_deadline() { info!("Skipping GC because lsn lease deadline is not reached"); return Ok(GcResult::default()); } @@ -3872,6 +3907,10 @@ impl TenantShard { &self.tenant_shard_id } + pub(crate) fn get_shard_identity(&self) -> ShardIdentity { + self.shard_identity + } + pub(crate) fn get_shard_stripe_size(&self) -> ShardStripeSize { self.shard_identity.stripe_size } @@ -4197,10 +4236,16 @@ impl TenantShard { } pub fn get_lsn_lease_length(&self) -> Duration { - let tenant_conf = self.tenant_conf.load().tenant_conf.clone(); + Self::get_lsn_lease_length_impl(self.conf, &self.tenant_conf.load().tenant_conf) + } + + pub fn get_lsn_lease_length_impl( + conf: &'static PageServerConf, + tenant_conf: &pageserver_api::models::TenantConfig, + ) -> Duration { tenant_conf .lsn_lease_length - .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) + .unwrap_or(conf.default_tenant_conf.lsn_lease_length) } pub fn get_timeline_offloading_enabled(&self) -> bool { @@ -4525,6 +4570,10 @@ impl TenantShard { Ok(toml_edit::de::from_str::(&config)?) } + /// Stores a tenant location config to disk. + /// + /// NB: make sure to call `ShardIdentity::assert_equal` before persisting a new config, to avoid + /// changes to shard parameters that may result in data corruption. 
#[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(super) async fn persist_tenant_config( conf: &'static PageServerConf, @@ -6008,7 +6057,7 @@ pub(crate) mod harness { AttachedTenantConf::try_from(LocationConf::attached_single( self.tenant_conf.clone(), self.generation, - &ShardParameters::default(), + ShardParameters::default(), )) .unwrap(), self.shard_identity, @@ -6660,17 +6709,13 @@ mod tests { tline.freeze_and_flush().await.map_err(|e| e.into()) } - #[tokio::test(start_paused = true)] + #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data") .await? .load() .await; - // Advance to the lsn lease deadline so that GC is not blocked by - // initial transition into AttachedSingle. - tokio::time::advance(tenant.get_lsn_lease_length()).await; - tokio::time::resume(); let tline = tenant .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) .await?; @@ -6685,6 +6730,7 @@ mod tests { Some(TIMELINE_ID), 0x10, Duration::ZERO, + IgnoreLeaseDeadline::Default, &CancellationToken::new(), &ctx, ) @@ -6798,6 +6844,7 @@ mod tests { Some(TIMELINE_ID), 0x10, Duration::ZERO, + IgnoreLeaseDeadline::Default, &CancellationToken::new(), &ctx, ) @@ -6858,6 +6905,7 @@ mod tests { Some(TIMELINE_ID), 0x10, Duration::ZERO, + IgnoreLeaseDeadline::Default, &CancellationToken::new(), &ctx, ) @@ -6892,6 +6940,7 @@ mod tests { Some(TIMELINE_ID), 0x10, Duration::ZERO, + IgnoreLeaseDeadline::Default, &CancellationToken::new(), &ctx, ) @@ -7173,7 +7222,14 @@ mod tests { // this doesn't really need to use the timeline_id target, but it is closer to what it // originally was. let res = tenant - .gc_iteration(Some(timeline.timeline_id), 0, Duration::ZERO, &cancel, ctx) + .gc_iteration( + Some(timeline.timeline_id), + 0, + Duration::ZERO, + IgnoreLeaseDeadline::Default, + &cancel, + ctx, + ) .await?; assert_eq!(res.layers_removed, 0, "this never removes anything"); @@ -7791,7 +7847,14 @@ mod tests { // Perform a cycle of flush, and GC tline.freeze_and_flush().await?; tenant - .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .gc_iteration( + Some(tline.timeline_id), + 0, + Duration::ZERO, + IgnoreLeaseDeadline::Default, + &cancel, + &ctx, + ) .await?; } @@ -7882,7 +7945,14 @@ mod tests { tline.freeze_and_flush().await?; tline.compact(&cancel, EnumSet::default(), &ctx).await?; tenant - .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .gc_iteration( + Some(tline.timeline_id), + 0, + Duration::ZERO, + IgnoreLeaseDeadline::Default, + &cancel, + &ctx, + ) .await?; } @@ -8218,7 +8288,14 @@ mod tests { ) .await?; tenant - .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx) + .gc_iteration( + Some(tline.timeline_id), + 0, + Duration::ZERO, + IgnoreLeaseDeadline::Default, + &cancel, + &ctx, + ) .await?; } } @@ -9369,17 +9446,13 @@ mod tests { Ok(()) } - #[tokio::test(start_paused = true)] + #[tokio::test] async fn test_lsn_lease() -> anyhow::Result<()> { let (tenant, ctx) = TenantHarness::create("test_lsn_lease") .await .unwrap() .load() .await; - // Advance to the lsn lease deadline so that GC is not blocked by - // initial transition into AttachedSingle. 
- tokio::time::advance(tenant.get_lsn_lease_length()).await; - tokio::time::resume(); let key = Key::from_hex("010000000033333333444444445500000000").unwrap(); let end_lsn = Lsn(0x100); @@ -9443,6 +9516,7 @@ mod tests { Some(TIMELINE_ID), 0, Duration::ZERO, + IgnoreLeaseDeadline::Default, &CancellationToken::new(), &ctx, ) diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c5087f7e0f..67df767abd 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -12,6 +12,7 @@ use pageserver_api::models; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::{Deserialize, Serialize}; +use utils::critical; use utils::generation::Generation; #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] @@ -136,7 +137,7 @@ impl LocationConf { pub(crate) fn attached_single( tenant_conf: pageserver_api::models::TenantConfig, generation: Generation, - shard_params: &models::ShardParameters, + shard_params: models::ShardParameters, ) -> Self { Self { mode: LocationMode::Attached(AttachedLocationConfig { @@ -171,6 +172,16 @@ impl LocationConf { } } + // This should never happen. + // TODO: turn this into a proper assertion. + if stripe_size != self.shard.stripe_size { + critical!( + "stripe size mismatch: {} != {}", + self.shard.stripe_size, + stripe_size, + ); + } + self.shard.stripe_size = stripe_size; } diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 248d92622e..2888fdeac0 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -664,7 +664,7 @@ pub async fn init_tenant_mgr( tenant_shard_id, &tenant_dir_path, resources.clone(), - AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + AttachedTenantConf::new(conf, location_conf.tenant_conf, attached_conf), shard_identity, Some(init_order.clone()), SpawnMode::Lazy, @@ -842,7 +842,7 @@ impl TenantManager { // take our fast path and just provide the updated configuration // to the tenant. tenant.set_new_location_config( - AttachedTenantConf::try_from(new_location_config.clone()) + AttachedTenantConf::try_from(self.conf, new_location_config.clone()) .map_err(UpsertLocationError::BadRequest)?, ); @@ -880,6 +880,9 @@ impl TenantManager { // phase of writing config and/or waiting for flush, before returning. match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { + tenant + .shard_identity + .assert_equal(new_location_config.shard); TenantShard::persist_tenant_config( self.conf, &tenant_shard_id, @@ -914,7 +917,10 @@ impl TenantManager { return Ok(Some(tenant)); } - Some(FastPathModified::Secondary(_secondary_tenant)) => { + Some(FastPathModified::Secondary(secondary_tenant)) => { + secondary_tenant + .shard_identity + .assert_equal(new_location_config.shard); TenantShard::persist_tenant_config( self.conf, &tenant_shard_id, @@ -948,6 +954,10 @@ impl TenantManager { match slot_guard.get_old_value() { Some(TenantSlot::Attached(tenant)) => { + tenant + .shard_identity + .assert_equal(new_location_config.shard); + // The case where we keep a Tenant alive was covered above in the special case // for Attached->Attached transitions in the same generation. By this point, // if we see an attached tenant we know it will be discarded and should be @@ -981,9 +991,13 @@ impl TenantManager { // rather than assuming it to be empty. 
spawn_mode = SpawnMode::Eager; } - Some(TenantSlot::Secondary(state)) => { + Some(TenantSlot::Secondary(secondary_tenant)) => { + secondary_tenant + .shard_identity + .assert_equal(new_location_config.shard); + info!("Shutting down secondary tenant"); - state.shutdown().await; + secondary_tenant.shutdown().await; } Some(TenantSlot::InProgress(_)) => { // This should never happen: acquire_slot should error out @@ -1032,7 +1046,7 @@ impl TenantManager { // Testing hack: if we are configured with no control plane, then drop the generation // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. - let attached_conf = AttachedTenantConf::try_from(new_location_config) + let attached_conf = AttachedTenantConf::try_from(self.conf, new_location_config) .map_err(UpsertLocationError::BadRequest)?; let tenant = tenant_spawn( @@ -1236,7 +1250,7 @@ impl TenantManager { tenant_shard_id, &tenant_path, self.resources.clone(), - AttachedTenantConf::try_from(config)?, + AttachedTenantConf::try_from(self.conf, config)?, shard_identity, None, SpawnMode::Eager, @@ -2117,7 +2131,7 @@ impl TenantManager { tenant_shard_id, &tenant_path, self.resources.clone(), - AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?, + AttachedTenantConf::try_from(self.conf, config).map_err(Error::DetachReparent)?, shard_identity, None, SpawnMode::Eager, @@ -2351,7 +2365,14 @@ impl TenantManager { #[allow(unused_mut)] let mut result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx) + .gc_iteration( + Some(timeline_id), + gc_horizon, + pitr, + crate::tenant::IgnoreLeaseDeadline::from(gc_req.ignore_lease_deadline), + &cancel, + &ctx, + ) .await; // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it // better once the types support it. diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 2fa0ed9be9..e06788543a 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -101,7 +101,7 @@ pub(crate) struct SecondaryTenant { // Secondary mode does not need the full shard identity or the pageserver_api::models::TenantConfig. However, // storing these enables us to report our full LocationConf, enabling convenient reconciliation // by the control plane (see [`Self::get_location_conf`]) - shard_identity: ShardIdentity, + pub(crate) shard_identity: ShardIdentity, tenant_conf: std::sync::Mutex, // Internal state used by the Downloader. diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 1d50a5f3a0..7f0ca5b337 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -55,11 +55,11 @@ pub struct BatchLayerWriter { } impl BatchLayerWriter { - pub async fn new(conf: &'static PageServerConf) -> anyhow::Result { - Ok(Self { + pub fn new(conf: &'static PageServerConf) -> Self { + Self { generated_layer_writers: Vec::new(), conf, - }) + } } pub fn add_unfinished_image_writer( @@ -182,7 +182,7 @@ impl BatchLayerWriter { /// An image writer that takes images and produces multiple image layers. 
#[must_use] pub struct SplitImageLayerWriter<'a> { - inner: ImageLayerWriter, + inner: Option, target_layer_size: u64, lsn: Lsn, conf: &'static PageServerConf, @@ -196,7 +196,7 @@ pub struct SplitImageLayerWriter<'a> { impl<'a> SplitImageLayerWriter<'a> { #[allow(clippy::too_many_arguments)] - pub async fn new( + pub fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -205,30 +205,19 @@ impl<'a> SplitImageLayerWriter<'a> { target_layer_size: u64, gate: &'a utils::sync::gate::Gate, cancel: CancellationToken, - ctx: &RequestContext, - ) -> anyhow::Result { - Ok(Self { + ) -> Self { + Self { target_layer_size, - inner: ImageLayerWriter::new( - conf, - timeline_id, - tenant_shard_id, - &(start_key..Key::MAX), - lsn, - gate, - cancel.clone(), - ctx, - ) - .await?, + inner: None, conf, timeline_id, tenant_shard_id, - batches: BatchLayerWriter::new(conf).await?, + batches: BatchLayerWriter::new(conf), lsn, start_key, gate, cancel, - }) + } } pub async fn put_image( @@ -237,12 +226,31 @@ impl<'a> SplitImageLayerWriter<'a> { img: Bytes, ctx: &RequestContext, ) -> Result<(), PutError> { + if self.inner.is_none() { + self.inner = Some( + ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(self.start_key..Key::MAX), + self.lsn, + self.gate, + self.cancel.clone(), + ctx, + ) + .await + .map_err(PutError::Other)?, + ); + } + + let inner = self.inner.as_mut().unwrap(); + // The current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. let addition_size_estimation = KEY_SIZE as u64 + img.len() as u64; - if self.inner.num_keys() >= 1 - && self.inner.estimated_size() + addition_size_estimation >= self.target_layer_size + if inner.num_keys() >= 1 + && inner.estimated_size() + addition_size_estimation >= self.target_layer_size { let next_image_writer = ImageLayerWriter::new( self.conf, @@ -256,7 +264,7 @@ impl<'a> SplitImageLayerWriter<'a> { ) .await .map_err(PutError::Other)?; - let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); + let prev_image_writer = std::mem::replace(inner, next_image_writer); self.batches.add_unfinished_image_writer( prev_image_writer, self.start_key..key, @@ -264,7 +272,7 @@ impl<'a> SplitImageLayerWriter<'a> { ); self.start_key = key; } - self.inner.put_image(key, img, ctx).await + inner.put_image(key, img, ctx).await } pub(crate) async fn finish_with_discard_fn( @@ -281,8 +289,10 @@ impl<'a> SplitImageLayerWriter<'a> { let Self { mut batches, inner, .. 
} = self; - if inner.num_keys() != 0 { - batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn); + if let Some(inner) = inner { + if inner.num_keys() != 0 { + batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn); + } } batches.finish_with_discard_fn(tline, ctx, discard_fn).await } @@ -319,7 +329,7 @@ pub struct SplitDeltaLayerWriter<'a> { } impl<'a> SplitDeltaLayerWriter<'a> { - pub async fn new( + pub fn new( conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -327,8 +337,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { target_layer_size: u64, gate: &'a utils::sync::gate::Gate, cancel: CancellationToken, - ) -> anyhow::Result { - Ok(Self { + ) -> Self { + Self { target_layer_size, inner: None, conf, @@ -336,10 +346,10 @@ impl<'a> SplitDeltaLayerWriter<'a> { tenant_shard_id, lsn_range, last_key_written: Key::MIN, - batches: BatchLayerWriter::new(conf).await?, + batches: BatchLayerWriter::new(conf), gate, cancel, - }) + } } pub async fn put_value( @@ -497,10 +507,7 @@ mod tests { 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), - &ctx, - ) - .await - .unwrap(); + ); let mut delta_writer = SplitDeltaLayerWriter::new( tenant.conf, @@ -510,9 +517,7 @@ mod tests { 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), - ) - .await - .unwrap(); + ); image_writer .put_image(get_key(0), get_img(0), &ctx) @@ -578,10 +583,7 @@ mod tests { 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), - &ctx, - ) - .await - .unwrap(); + ); let mut delta_writer = SplitDeltaLayerWriter::new( tenant.conf, tline.timeline_id, @@ -590,9 +592,7 @@ mod tests { 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), - ) - .await - .unwrap(); + ); const N: usize = 2000; for i in 0..N { let i = i as u32; @@ -679,10 +679,7 @@ mod tests { 4 * 1024, &tline.gate, tline.cancel.clone(), - &ctx, - ) - .await - .unwrap(); + ); let mut delta_writer = SplitDeltaLayerWriter::new( tenant.conf, @@ -692,9 +689,7 @@ mod tests { 4 * 1024, &tline.gate, tline.cancel.clone(), - ) - .await - .unwrap(); + ); image_writer .put_image(get_key(0), get_img(0), &ctx) @@ -770,9 +765,7 @@ mod tests { 4 * 1024 * 1024, &tline.gate, tline.cancel.clone(), - ) - .await - .unwrap(); + ); for i in 0..N { let i = i as u32; diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 954dd38bb4..e4890efdb0 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -406,6 +406,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { None, gc_horizon, tenant.get_pitr_interval(), + crate::tenant::IgnoreLeaseDeadline::Default, &cancel, &ctx, )) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c46a763688..6b1e747cb6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -761,7 +761,7 @@ pub(crate) enum CreateImageLayersError { PageReconstructError(#[source] PageReconstructError), #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl From for CreateImageLayersError { @@ -5586,7 +5586,7 @@ impl Timeline { self.should_check_if_image_layers_required(lsn) }; - let mut batch_image_writer = BatchLayerWriter::new(self.conf).await?; + let mut batch_image_writer = BatchLayerWriter::new(self.conf); let mut all_generated = true; @@ -5690,7 +5690,8 @@ impl Timeline { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(CreateImageLayersError::Other)?; fail_point!("image-layer-writer-fail-before-finish", |_| { 
Err(CreateImageLayersError::Other(anyhow::anyhow!( @@ -5785,7 +5786,10 @@ impl Timeline { } } - let image_layers = batch_image_writer.finish(self, ctx).await?; + let image_layers = batch_image_writer + .finish(self, ctx) + .await + .map_err(CreateImageLayersError::Other)?; let mut guard = self.layers.write(LayerManagerLockHolder::Compaction).await; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index a7c117c221..a7bd89a326 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -3500,22 +3500,16 @@ impl Timeline { // Only create image layers when there is no ancestor branches. TODO: create covering image layer // when some condition meet. let mut image_layer_writer = if !has_data_below { - Some( - SplitImageLayerWriter::new( - self.conf, - self.timeline_id, - self.tenant_shard_id, - job_desc.compaction_key_range.start, - lowest_retain_lsn, - self.get_compaction_target_size(), - &self.gate, - self.cancel.clone(), - ctx, - ) - .await - .context("failed to create image layer writer") - .map_err(CompactionError::Other)?, - ) + Some(SplitImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + job_desc.compaction_key_range.start, + lowest_retain_lsn, + self.get_compaction_target_size(), + &self.gate, + self.cancel.clone(), + )) } else { None }; @@ -3528,10 +3522,7 @@ impl Timeline { self.get_compaction_target_size(), &self.gate, self.cancel.clone(), - ) - .await - .context("failed to create delta layer writer") - .map_err(CompactionError::Other)?; + ); #[derive(Default)] struct RewritingLayers { @@ -4327,7 +4318,8 @@ impl TimelineAdaptor { self.timeline.cancel.clone(), ctx, ) - .await?; + .await + .map_err(CreateImageLayersError::Other)?; fail_point!("image-layer-writer-fail-before-finish", |_| { Err(CreateImageLayersError::Other(anyhow::anyhow!( @@ -4336,7 +4328,10 @@ impl TimelineAdaptor { }); let keyspace = KeySpace { - ranges: self.get_keyspace(key_range, lsn, ctx).await?, + ranges: self + .get_keyspace(key_range, lsn, ctx) + .await + .map_err(CreateImageLayersError::Other)?, }; // TODO set proper (stateful) start. 
The create_image_layer_for_rel_blocks function mostly let outcome = self @@ -4355,9 +4350,13 @@ impl TimelineAdaptor { unfinished_image_layer, } = outcome { - let (desc, path) = unfinished_image_layer.finish(ctx).await?; + let (desc, path) = unfinished_image_layer + .finish(ctx) + .await + .map_err(CreateImageLayersError::Other)?; let image_layer = - Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)?; + Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path) + .map_err(CreateImageLayersError::Other)?; self.new_images.push(image_layer); } diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index f47ce5408b..f20a1343df 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -885,7 +885,7 @@ async fn remote_copy( } tracing::info!("Deleting orphan layer file to make way for hard linking"); // Delete orphan layer file and try again, to ensure this layer has a well understood source - std::fs::remove_file(adopted_path) + std::fs::remove_file(&adoptee_path) .map_err(|e| Error::launder(e.into(), Error::Prepare))?; std::fs::hard_link(adopted_path, &adoptee_path) .map_err(|e| Error::launder(e.into(), Error::Prepare))?; diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 809b350f38..2dbff20ab2 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -887,7 +887,7 @@ mod tests { .expect("we still have it"); } - fn make_relation_key_for_shard(shard: ShardNumber, params: &ShardParameters) -> Key { + fn make_relation_key_for_shard(shard: ShardNumber, params: ShardParameters) -> Key { rel_block_to_key( RelTag { spcnode: 1663, @@ -917,14 +917,14 @@ mod tests { let child0 = Arc::new_cyclic(|myself| StubTimeline { gate: Default::default(), id: timeline_id, - shard: ShardIdentity::from_params(ShardNumber(0), &child_params), + shard: ShardIdentity::from_params(ShardNumber(0), child_params), per_timeline_state: PerTimelineState::default(), myself: myself.clone(), }); let child1 = Arc::new_cyclic(|myself| StubTimeline { gate: Default::default(), id: timeline_id, - shard: ShardIdentity::from_params(ShardNumber(1), &child_params), + shard: ShardIdentity::from_params(ShardNumber(1), child_params), per_timeline_state: PerTimelineState::default(), myself: myself.clone(), }); @@ -937,7 +937,7 @@ mod tests { let handle = cache .get( timeline_id, - ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)), &StubManager { shards: vec![parent.clone()], }, @@ -961,7 +961,7 @@ mod tests { let handle = cache .get( timeline_id, - ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)), &StubManager { shards: vec![], // doesn't matter what's in here, the cache is fully loaded }, @@ -978,7 +978,7 @@ mod tests { let parent_handle = cache .get( timeline_id, - ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), &child_params)), + ShardSelector::Page(make_relation_key_for_shard(ShardNumber(0), child_params)), &StubManager { shards: vec![parent.clone()], }, @@ -995,7 +995,7 @@ mod tests { let handle = cache .get( timeline_id, - ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), &child_params)), + 
ShardSelector::Page(make_relation_key_for_shard(ShardNumber(i), child_params)), &StubManager { shards: vec![child0.clone(), child1.clone()], // <====== this changed compared to previous loop }, diff --git a/proxy/src/error.rs b/proxy/src/error.rs index aa02b211d9..e880d63075 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -78,16 +78,6 @@ pub(crate) trait ReportableError: fmt::Display + Send + 'static { fn get_error_kind(&self) -> ErrorKind; } -impl ReportableError for postgres_client::error::Error { - fn get_error_kind(&self) -> ErrorKind { - if self.as_db_error().is_some() { - ErrorKind::Postgres - } else { - ErrorKind::Compute - } - } -} - /// Flattens `Result>` into `Result`. pub fn flatten_err(r: Result, JoinError>) -> anyhow::Result { r.context("join error").and_then(|x| x) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 26269d0a6e..7708342ae3 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -404,7 +404,15 @@ impl ReportableError for HttpConnError { fn get_error_kind(&self) -> ErrorKind { match self { HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, - HttpConnError::PostgresConnectionError(p) => p.get_error_kind(), + HttpConnError::PostgresConnectionError(p) => { + if p.as_db_error().is_some() { + // postgres rejected the connection + ErrorKind::Postgres + } else { + // couldn't even reach postgres + ErrorKind::Compute + } + } HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute, HttpConnError::ComputeCtl(_) => ErrorKind::Service, HttpConnError::JwtPayloadError(_) => ErrorKind::User, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index b2eb801f5c..5d5e7bf83e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -22,7 +22,7 @@ use serde_json::Value; use serde_json::value::RawValue; use tokio::time::{self, Instant}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info}; +use tracing::{Level, debug, error, info}; use typed_json::json; use url::Url; use uuid::Uuid; @@ -390,12 +390,35 @@ pub(crate) async fn handle( let line = get(db_error, |db| db.line().map(|l| l.to_string())); let routine = get(db_error, |db| db.routine()); - tracing::info!( - kind=error_kind.to_metric_label(), - error=%e, - msg=message, - "forwarding error to user" - ); + match &e { + SqlOverHttpError::Postgres(e) + if e.as_db_error().is_some() && error_kind == ErrorKind::User => + { + // this error contains too much info, and it's not an error we care about. + if tracing::enabled!(Level::DEBUG) { + tracing::debug!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" + ); + } else { + tracing::info!( + kind = error_kind.to_metric_label(), + error = "bad query", + "forwarding error to user" + ); + } + } + _ => { + tracing::info!( + kind=error_kind.to_metric_label(), + error=%e, + msg=message, + "forwarding error to user" + ); + } + } json_response( e.get_http_status_code(), @@ -460,7 +483,15 @@ impl ReportableError for SqlOverHttpError { SqlOverHttpError::ConnInfo(e) => e.get_error_kind(), SqlOverHttpError::ResponseTooLarge(_) => ErrorKind::User, SqlOverHttpError::InvalidIsolationLevel => ErrorKind::User, - SqlOverHttpError::Postgres(p) => p.get_error_kind(), + // customer initiated SQL errors. + SqlOverHttpError::Postgres(p) => { + if p.as_db_error().is_some() { + ErrorKind::User + } else { + ErrorKind::Compute + } + } + // proxy initiated SQL errors. 
SqlOverHttpError::InternalPostgres(p) => { if p.as_db_error().is_some() { ErrorKind::Service } else { ErrorKind::Compute } } + // postgres returned a bad row format that we couldn't parse. SqlOverHttpError::JsonConversion(_) => ErrorKind::Postgres, SqlOverHttpError::Cancelled(c) => c.get_error_kind(), } diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 2e46a7b529..b4bb193a4b 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -52,7 +52,7 @@ pub trait ResponseErrorMessageExt: Sized { impl ResponseErrorMessageExt for reqwest::Response { async fn error_from_body(self) -> Result<Self> { let status = self.status(); - if !(status.is_client_error() || status.is_server_error()) { + if status.is_success() { return Ok(self); } diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 5e7f1d8758..373589a18e 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -220,7 +220,7 @@ impl postgres_backend::Handler stripe_size: ShardStripeSize(stripe_size), }; self.shard = - Some(ShardIdentity::from_params(ShardNumber(number), &params)); + Some(ShardIdentity::from_params(ShardNumber(number), params)); } _ => { return Err(QueryError::Other(anyhow::anyhow!( diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index b6cf73be2e..32624d260d 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -9,7 +9,7 @@ use anyhow::{Result, bail}; use postgres_ffi::WAL_SEGMENT_SIZE; use postgres_versioninfo::{PgMajorVersion, PgVersionId}; use safekeeper_api::membership::Configuration; -use safekeeper_api::models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}; +use safekeeper_api::models::TimelineTermBumpResponse; use safekeeper_api::{INITIAL_TERM, ServerInfo, Term}; use serde::{Deserialize, Serialize}; use tracing::info; @@ -83,6 +83,11 @@ pub enum EvictionState { Offloaded(Lsn), } +pub struct MembershipSwitchResult { + pub previous_conf: Configuration, + pub current_conf: Configuration, +} + impl TimelinePersistentState { /// commit_lsn is the same as start_lsn in the normal creation; see /// `TimelineCreateRequest` comments. @@ -261,10 +266,7 @@ where /// Switch into membership configuration `to` if it is higher than the /// current one. - pub async fn membership_switch( - &mut self, - to: Configuration, - ) -> Result<TimelineMembershipSwitchResponse> { + pub async fn membership_switch(&mut self, to: Configuration) -> Result<MembershipSwitchResult> { let before = self.mconf.clone(); // Is switch allowed? if to.generation <= self.mconf.generation { @@ -278,7 +280,7 @@ where self.finish_change(&state).await?; info!("switched membership conf to {} from {}", to, before); } - Ok(TimelineMembershipSwitchResponse { + Ok(MembershipSwitchResult { previous_conf: before, current_conf: self.mconf.clone(), }) diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 37035c8759..cf38019f66 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -190,7 +190,14 @@ impl StateSK { &mut self, to: Configuration, ) -> Result<TimelineMembershipSwitchResponse> { - self.state_mut().membership_switch(to).await + let result = self.state_mut().membership_switch(to).await?; + + Ok(TimelineMembershipSwitchResponse { + previous_conf: result.previous_conf, + current_conf: result.current_conf, + term: self.state().acceptor_state.term, + flush_lsn: self.flush_lsn(), + }) } /// Close open WAL files to release FDs.
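
The membership switch above only applies a configuration whose generation is strictly greater than the current one; the storage layer returns the bare before/after configurations and `StateSK` then enriches them with `term` and `flush_lsn`. Below is a minimal, self-contained sketch of that monotonic-generation guard, using simplified stand-in types rather than the safekeeper's real structs:

```rust
// Hypothetical, simplified illustration of the generation-guarded membership switch.
#[derive(Clone, Debug, PartialEq)]
struct Configuration {
    generation: u32,
    members: Vec<u64>,
}

struct SwitchResult {
    previous_conf: Configuration,
    current_conf: Configuration,
}

struct State {
    mconf: Configuration,
}

impl State {
    fn membership_switch(&mut self, to: Configuration) -> SwitchResult {
        let before = self.mconf.clone();
        // Stale or duplicate requests (generation <= current) are ignored,
        // which keeps the operation idempotent and safe to retry.
        if to.generation > self.mconf.generation {
            self.mconf = to;
        }
        SwitchResult {
            previous_conf: before,
            current_conf: self.mconf.clone(),
        }
    }
}

fn main() {
    let mut state = State {
        mconf: Configuration { generation: 3, members: vec![1, 2, 3] },
    };
    let res = state.membership_switch(Configuration { generation: 2, members: vec![4, 5, 6] });
    assert_eq!(res.current_conf.generation, 3); // stale switch is a no-op
}
```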
diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 33310706be..70e53d86ee 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -154,8 +154,8 @@ pub struct PhysicalStorage { /// record /// /// Partial segment 002 has no WAL records, and it will be removed by the - /// next truncate_wal(). This flag will be set to true after the first - /// truncate_wal() call. + /// next truncate_wal(). This flag will be set to false after the first + /// successful truncate_wal() call. /// /// [`write_lsn`]: Self::write_lsn pending_wal_truncation: bool, @@ -202,6 +202,8 @@ impl PhysicalStorage { ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); if flush_lsn < state.commit_lsn { + // note: can never happen. find_end_of_wal returns provided start_lsn + // (state.commit_lsn in our case) if it doesn't find anything. bail!( "timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", ttid.timeline_id, diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 3a0806b3b2..143f4241f4 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -20,6 +20,7 @@ camino.workspace = true chrono.workspace = true clap.workspace = true clashmap.workspace = true +compute_api.workspace = true cron.workspace = true fail.workspace = true futures.workspace = true diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 0b5569b3d6..ab37a207e4 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -5,7 +5,8 @@ use std::sync::Arc; use std::time::Duration; use anyhow::Context; -use control_plane::endpoint::{ComputeControlPlane, EndpointStatus, PageserverProtocol}; +use compute_api::spec::PageserverProtocol; +use control_plane::endpoint::{ComputeControlPlane, EndpointStatus}; use control_plane::local_env::LocalEnv; use futures::StreamExt; use hyper::StatusCode; @@ -13,11 +14,12 @@ use pageserver_api::config::DEFAULT_GRPC_LISTEN_PORT; use pageserver_api::controller_api::AvailabilityZone; use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId}; use postgres_connection::parse_host_port; +use safekeeper_api::membership::SafekeeperGeneration; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; use tracing::{Instrument, info_span}; use utils::backoff::{self}; -use utils::id::{NodeId, TenantId}; +use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId}; use crate::service::Config; @@ -35,7 +37,7 @@ struct UnshardedComputeHookTenant { preferred_az: Option, // Must hold this lock to send a notification. - send_lock: Arc>>, + send_lock: Arc>>, } struct ShardedComputeHookTenant { stripe_size: ShardStripeSize, @@ -48,7 +50,7 @@ struct ShardedComputeHookTenant { // Must hold this lock to send a notification. The contents represent // the last successfully sent notification, and are used to coalesce multiple // updates by only sending when there is a chance since our last successful send. - send_lock: Arc>>, + send_lock: Arc>>, } /// Represents our knowledge of the compute's state: we can update this when we get a @@ -56,9 +58,9 @@ struct ShardedComputeHookTenant { /// /// Should be wrapped in an Option<>, as we cannot always know the remote state. 
#[derive(PartialEq, Eq, Debug)] -struct ComputeRemoteState { +struct ComputeRemoteState { // The request body which was acked by the compute - request: ComputeHookNotifyRequest, + request: R, // Whether the cplane indicated that the state was applied to running computes, or just // persisted. In the Neon control plane, this is the difference between a 423 response (meaning @@ -66,6 +68,36 @@ struct ComputeRemoteState { applied: bool, } +type ComputeRemoteTenantState = ComputeRemoteState; +type ComputeRemoteTimelineState = ComputeRemoteState; + +/// The trait which define the handler-specific types and methods. +/// We have two implementations of this trait so far: +/// - [`ComputeHookTenant`] for tenant attach notifications ("/notify-attach") +/// - [`ComputeHookTimeline`] for safekeeper change notifications ("/notify-safekeepers") +trait ApiMethod { + /// Type of the key which identifies the resource. + /// It's either TenantId for tenant attach notifications, + /// or TenantTimelineId for safekeeper change notifications. + type Key: std::cmp::Eq + std::hash::Hash + Clone; + + type Request: serde::Serialize + std::fmt::Debug; + + const API_PATH: &'static str; + + fn maybe_send( + &self, + key: Self::Key, + lock: Option>>>, + ) -> MaybeSendResult; + + async fn notify_local( + env: &LocalEnv, + cplane: &ComputeControlPlane, + req: &Self::Request, + ) -> Result<(), NotifyError>; +} + enum ComputeHookTenant { Unsharded(UnshardedComputeHookTenant), Sharded(ShardedComputeHookTenant), @@ -96,7 +128,7 @@ impl ComputeHookTenant { } } - fn get_send_lock(&self) -> &Arc>> { + fn get_send_lock(&self) -> &Arc>> { match self { Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock, Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock, @@ -190,19 +222,136 @@ impl ComputeHookTenant { } } +/// The state of a timeline we need to notify the compute about. +struct ComputeHookTimeline { + generation: SafekeeperGeneration, + safekeepers: Vec, + + send_lock: Arc>>, +} + +impl ComputeHookTimeline { + /// Construct a new ComputeHookTimeline with the given safekeepers and generation. + fn new(generation: SafekeeperGeneration, safekeepers: Vec) -> Self { + Self { + generation, + safekeepers, + send_lock: Arc::default(), + } + } + + /// Update the state with a new SafekeepersUpdate. + /// Noop if the update generation is not greater than the current generation. + fn update(&mut self, sk_update: SafekeepersUpdate) { + if sk_update.generation > self.generation { + self.generation = sk_update.generation; + self.safekeepers = sk_update.safekeepers; + } + } +} + +impl ApiMethod for ComputeHookTimeline { + type Key = TenantTimelineId; + type Request = NotifySafekeepersRequest; + + const API_PATH: &'static str = "notify-safekeepers"; + + fn maybe_send( + &self, + ttid: TenantTimelineId, + lock: Option>>, + ) -> MaybeSendNotifySafekeepersResult { + let locked = match lock { + Some(already_locked) => already_locked, + None => { + // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::timelines`] lock. 
+ let Ok(locked) = self.send_lock.clone().try_lock_owned() else { + return MaybeSendResult::AwaitLock((ttid, self.send_lock.clone())); + }; + locked + } + }; + + if locked + .as_ref() + .is_some_and(|s| s.request.generation >= self.generation) + { + return MaybeSendResult::Noop; + } + + MaybeSendResult::Transmit(( + NotifySafekeepersRequest { + tenant_id: ttid.tenant_id, + timeline_id: ttid.timeline_id, + generation: self.generation, + safekeepers: self.safekeepers.clone(), + }, + locked, + )) + } + + async fn notify_local( + _env: &LocalEnv, + cplane: &ComputeControlPlane, + req: &NotifySafekeepersRequest, + ) -> Result<(), NotifyError> { + let NotifySafekeepersRequest { + tenant_id, + timeline_id, + generation, + safekeepers, + } = req; + + for (endpoint_name, endpoint) in &cplane.endpoints { + if endpoint.tenant_id == *tenant_id + && endpoint.timeline_id == *timeline_id + && endpoint.status() == EndpointStatus::Running + { + tracing::info!("Reconfiguring safekeepers for endpoint {endpoint_name}"); + + let safekeepers = safekeepers.iter().map(|sk| sk.id).collect::>(); + + endpoint + .reconfigure_safekeepers(safekeepers, *generation) + .await + .map_err(NotifyError::NeonLocal)?; + } + } + + Ok(()) + } +} + #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] -struct ComputeHookNotifyRequestShard { +struct NotifyAttachRequestShard { node_id: NodeId, shard_number: ShardNumber, } /// Request body that we send to the control plane to notify it of where a tenant is attached #[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] -struct ComputeHookNotifyRequest { +struct NotifyAttachRequest { tenant_id: TenantId, preferred_az: Option, stripe_size: Option, - shards: Vec, + shards: Vec, +} + +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq, Clone)] +pub(crate) struct SafekeeperInfo { + pub id: NodeId, + /// Hostname of the safekeeper. + /// It exists for better debuggability. Might be missing. + /// Should not be used for anything else. + pub hostname: Option, +} + +#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)] +struct NotifySafekeepersRequest { + tenant_id: TenantId, + timeline_id: TimelineId, + generation: SafekeeperGeneration, + safekeepers: Vec, } /// Error type for attempts to call into the control plane compute notification hook @@ -234,42 +383,50 @@ pub(crate) enum NotifyError { NeonLocal(anyhow::Error), } -enum MaybeSendResult { +enum MaybeSendResult { // Please send this request while holding the lock, and if you succeed then write // the request into the lock. Transmit( ( - ComputeHookNotifyRequest, - tokio::sync::OwnedMutexGuard>, + R, + tokio::sync::OwnedMutexGuard>>, ), ), // Something requires sending, but you must wait for a current sender then call again - AwaitLock(Arc>>), + AwaitLock((K, Arc>>>)), // Nothing requires sending Noop, } -impl ComputeHookTenant { +type MaybeSendNotifyAttachResult = MaybeSendResult; +type MaybeSendNotifySafekeepersResult = MaybeSendResult; + +impl ApiMethod for ComputeHookTenant { + type Key = TenantId; + type Request = NotifyAttachRequest; + + const API_PATH: &'static str = "notify-attach"; + fn maybe_send( &self, tenant_id: TenantId, - lock: Option>>, - ) -> MaybeSendResult { + lock: Option>>, + ) -> MaybeSendNotifyAttachResult { let locked = match lock { Some(already_locked) => already_locked, None => { - // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock. + // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::tenants`] lock. 
let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else { - return MaybeSendResult::AwaitLock(self.get_send_lock().clone()); + return MaybeSendResult::AwaitLock((tenant_id, self.get_send_lock().clone())); }; locked } }; let request = match self { - Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest { + Self::Unsharded(unsharded_tenant) => Some(NotifyAttachRequest { tenant_id, - shards: vec![ComputeHookNotifyRequestShard { + shards: vec![NotifyAttachRequestShard { shard_number: ShardNumber(0), node_id: unsharded_tenant.node_id, }], @@ -282,12 +439,12 @@ impl ComputeHookTenant { Self::Sharded(sharded_tenant) if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize => { - Some(ComputeHookNotifyRequest { + Some(NotifyAttachRequest { tenant_id, shards: sharded_tenant .shards .iter() - .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard { + .map(|(shard_number, node_id)| NotifyAttachRequestShard { shard_number: *shard_number, node_id: *node_id, }) @@ -332,98 +489,22 @@ impl ComputeHookTenant { } } } -} -/// The compute hook is a destination for notifications about changes to tenant:pageserver -/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures -/// the compute connection string. -pub(super) struct ComputeHook { - config: Config, - state: std::sync::Mutex>, - authorization_header: Option, - - // Concurrency limiter, so that we do not overload the cloud control plane when updating - // large numbers of tenants (e.g. when failing over after a node failure) - api_concurrency: tokio::sync::Semaphore, - - // This lock is only used in testing enviroments, to serialize calls into neon_lock - neon_local_lock: tokio::sync::Mutex<()>, - - // We share a client across all notifications to enable connection re-use etc when - // sending large numbers of notifications - client: reqwest::Client, -} - -/// Callers may give us a list of these when asking us to send a bulk batch -/// of notifications in the background. This is a 'notification' in the sense of -/// other code notifying us of a shard's status, rather than being the final notification -/// that we send upwards to the control plane for the whole tenant. 
-pub(crate) struct ShardUpdate<'a> { - pub(crate) tenant_shard_id: TenantShardId, - pub(crate) node_id: NodeId, - pub(crate) stripe_size: ShardStripeSize, - pub(crate) preferred_az: Option>, -} - -impl ComputeHook { - pub(super) fn new(config: Config) -> anyhow::Result { - let authorization_header = config - .control_plane_jwt_token - .clone() - .map(|jwt| format!("Bearer {jwt}")); - - let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT); - for cert in &config.ssl_ca_certs { - client = client.add_root_certificate(cert.clone()); - } - let client = client - .build() - .context("Failed to build http client for compute hook")?; - - Ok(Self { - state: Default::default(), - config, - authorization_header, - neon_local_lock: Default::default(), - api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), - client, - }) - } - - /// For test environments: use neon_local's LocalEnv to update compute - async fn do_notify_local( - &self, - reconfigure_request: &ComputeHookNotifyRequest, + async fn notify_local( + env: &LocalEnv, + cplane: &ComputeControlPlane, + req: &NotifyAttachRequest, ) -> Result<(), NotifyError> { - // neon_local updates are not safe to call concurrently, use a lock to serialize - // all calls to this function - let _locked = self.neon_local_lock.lock().await; - - let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else { - tracing::warn!( - "neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update" - ); - return Ok(()); - }; - let env = match LocalEnv::load_config(repo_dir) { - Ok(e) => e, - Err(e) => { - tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})"); - return Ok(()); - } - }; - let cplane = - ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); - let ComputeHookNotifyRequest { + let NotifyAttachRequest { tenant_id, shards, stripe_size, preferred_az: _preferred_az, - } = reconfigure_request; + } = req; for (endpoint_name, endpoint) in &cplane.endpoints { if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { - tracing::info!("Reconfiguring endpoint {endpoint_name}"); + tracing::info!("Reconfiguring pageservers for endpoint {endpoint_name}"); let pageservers = shards .iter() @@ -445,7 +526,7 @@ impl ComputeHook { .collect::>(); endpoint - .reconfigure(pageservers, *stripe_size, None) + .reconfigure_pageservers(pageservers, *stripe_size) .await .map_err(NotifyError::NeonLocal)?; } @@ -453,11 +534,102 @@ impl ComputeHook { Ok(()) } +} - async fn do_notify_iteration( +/// The compute hook is a destination for notifications about changes to tenant:pageserver +/// mapping. It aggregates updates for the shards in a tenant, and when appropriate reconfigures +/// the compute connection string. +pub(super) struct ComputeHook { + config: Config, + tenants: std::sync::Mutex>, + timelines: std::sync::Mutex>, + authorization_header: Option, + + // Concurrency limiter, so that we do not overload the cloud control plane when updating + // large numbers of tenants (e.g. 
when failing over after a node failure) + api_concurrency: tokio::sync::Semaphore, + + // This lock is only used in testing enviroments, to serialize calls into neon_local + neon_local_lock: tokio::sync::Mutex<()>, + + // We share a client across all notifications to enable connection re-use etc when + // sending large numbers of notifications + client: reqwest::Client, +} + +/// Callers may give us a list of these when asking us to send a bulk batch +/// of notifications in the background. This is a 'notification' in the sense of +/// other code notifying us of a shard's status, rather than being the final notification +/// that we send upwards to the control plane for the whole tenant. +pub(crate) struct ShardUpdate<'a> { + pub(crate) tenant_shard_id: TenantShardId, + pub(crate) node_id: NodeId, + pub(crate) stripe_size: ShardStripeSize, + pub(crate) preferred_az: Option>, +} + +pub(crate) struct SafekeepersUpdate { + pub(crate) tenant_id: TenantId, + pub(crate) timeline_id: TimelineId, + pub(crate) generation: SafekeeperGeneration, + pub(crate) safekeepers: Vec, +} + +impl ComputeHook { + pub(super) fn new(config: Config) -> anyhow::Result { + let authorization_header = config + .control_plane_jwt_token + .clone() + .map(|jwt| format!("Bearer {jwt}")); + + let mut client = reqwest::ClientBuilder::new().timeout(NOTIFY_REQUEST_TIMEOUT); + for cert in &config.ssl_ca_certs { + client = client.add_root_certificate(cert.clone()); + } + let client = client + .build() + .context("Failed to build http client for compute hook")?; + + Ok(Self { + tenants: Default::default(), + timelines: Default::default(), + config, + authorization_header, + neon_local_lock: Default::default(), + api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY), + client, + }) + } + + /// For test environments: use neon_local's LocalEnv to update compute + async fn do_notify_local(&self, req: &M::Request) -> Result<(), NotifyError> { + // neon_local updates are not safe to call concurrently, use a lock to serialize + // all calls to this function + let _locked = self.neon_local_lock.lock().await; + + let Some(repo_dir) = self.config.neon_local_repo_dir.as_deref() else { + tracing::warn!( + "neon_local_repo_dir not set, likely a bug in neon_local; skipping compute update" + ); + return Ok(()); + }; + let env = match LocalEnv::load_config(repo_dir) { + Ok(e) => e, + Err(e) => { + tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})"); + return Ok(()); + } + }; + let cplane = + ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane"); + + M::notify_local(&env, &cplane, req).await + } + + async fn do_notify_iteration( &self, url: &String, - reconfigure_request: &ComputeHookNotifyRequest, + reconfigure_request: &Req, cancel: &CancellationToken, ) -> Result<(), NotifyError> { let req = self.client.request(reqwest::Method::PUT, url); @@ -479,9 +651,7 @@ impl ComputeHook { }; // Treat all 2xx responses as success - if response.status() >= reqwest::StatusCode::OK - && response.status() < reqwest::StatusCode::MULTIPLE_CHOICES - { + if response.status().is_success() { if response.status() != reqwest::StatusCode::OK { // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so // log a warning. 
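
The compute-hook refactor above makes the notification plumbing generic over an `ApiMethod`-style trait, so the same send/retry machinery serves both the `notify-attach` and `notify-safekeepers` endpoints. A stripped-down sketch of that pattern follows; the trait and types here are hypothetical simplifications (the real trait also carries the key type and the maybe-send coalescing logic):

```rust
// Hypothetical, minimal version of the "one generic notifier, many endpoints" pattern.
use std::fmt::Debug;

trait ApiMethod {
    /// Request body type for this endpoint.
    type Request: Debug;
    /// Path appended to the control plane base URL.
    const API_PATH: &'static str;
}

struct NotifyAttach;
impl ApiMethod for NotifyAttach {
    type Request = String; // stands in for NotifyAttachRequest
    const API_PATH: &'static str = "notify-attach";
}

struct NotifySafekeepers;
impl ApiMethod for NotifySafekeepers {
    type Request = Vec<u64>; // stands in for NotifySafekeepersRequest
    const API_PATH: &'static str = "notify-safekeepers";
}

// One generic function covers both endpoints; the per-endpoint differences
// live entirely in the trait implementation.
fn do_notify<M: ApiMethod>(base_url: &str, request: &M::Request) {
    let url = format!("{}/{}", base_url.trim_end_matches('/'), M::API_PATH);
    println!("PUT {url} body={request:?}");
}

fn main() {
    do_notify::<NotifyAttach>("http://cplane/", &"tenant layout".to_string());
    do_notify::<NotifySafekeepers>("http://cplane/", &vec![1, 2, 3]);
}
```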
@@ -532,10 +702,10 @@ impl ComputeHook { } } - async fn do_notify( + async fn do_notify( &self, url: &String, - reconfigure_request: &ComputeHookNotifyRequest, + reconfigure_request: &R, cancel: &CancellationToken, ) -> Result<(), NotifyError> { // We hold these semaphore units across all retries, rather than only across each @@ -567,13 +737,13 @@ impl ComputeHook { } /// Synchronous phase: update the per-tenant state for the next intended notification - fn notify_prepare(&self, shard_update: ShardUpdate) -> MaybeSendResult { - let mut state_locked = self.state.lock().unwrap(); + fn notify_attach_prepare(&self, shard_update: ShardUpdate) -> MaybeSendNotifyAttachResult { + let mut tenants_locked = self.tenants.lock().unwrap(); use std::collections::hash_map::Entry; let tenant_shard_id = shard_update.tenant_shard_id; - let tenant = match state_locked.entry(tenant_shard_id.tenant_id) { + let tenant = match tenants_locked.entry(tenant_shard_id.tenant_id) { Entry::Vacant(e) => { let ShardUpdate { tenant_shard_id, @@ -597,10 +767,37 @@ impl ComputeHook { tenant.maybe_send(tenant_shard_id.tenant_id, None) } - async fn notify_execute( + fn notify_safekeepers_prepare( &self, - maybe_send_result: MaybeSendResult, - tenant_shard_id: TenantShardId, + safekeepers_update: SafekeepersUpdate, + ) -> MaybeSendNotifySafekeepersResult { + let mut timelines_locked = self.timelines.lock().unwrap(); + + let ttid = TenantTimelineId { + tenant_id: safekeepers_update.tenant_id, + timeline_id: safekeepers_update.timeline_id, + }; + + use std::collections::hash_map::Entry; + let timeline = match timelines_locked.entry(ttid) { + Entry::Vacant(e) => e.insert(ComputeHookTimeline::new( + safekeepers_update.generation, + safekeepers_update.safekeepers, + )), + Entry::Occupied(e) => { + let timeline = e.into_mut(); + timeline.update(safekeepers_update); + timeline + } + }; + + timeline.maybe_send(ttid, None) + } + + async fn notify_execute( + &self, + state: &std::sync::Mutex>, + maybe_send_result: MaybeSendResult, cancel: &CancellationToken, ) -> Result<(), NotifyError> { // Process result: we may get an update to send, or we may have to wait for a lock @@ -609,7 +806,7 @@ impl ComputeHook { MaybeSendResult::Noop => { return Ok(()); } - MaybeSendResult::AwaitLock(send_lock) => { + MaybeSendResult::AwaitLock((key, send_lock)) => { let send_locked = tokio::select! { guard = send_lock.lock_owned() => {guard}, _ = cancel.cancelled() => { @@ -620,11 +817,11 @@ impl ComputeHook { // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here // we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses // try_lock. 
- let state_locked = self.state.lock().unwrap(); - let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else { + let state_locked = state.lock().unwrap(); + let Some(resource_state) = state_locked.get(&key) else { return Ok(()); }; - match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) { + match resource_state.maybe_send(key, Some(send_locked)) { MaybeSendResult::AwaitLock(_) => { unreachable!("We supplied lock guard") } @@ -643,14 +840,18 @@ impl ComputeHook { .control_plane_url .as_ref() .map(|control_plane_url| { - format!("{}/notify-attach", control_plane_url.trim_end_matches('/')) + format!( + "{}/{}", + control_plane_url.trim_end_matches('/'), + M::API_PATH + ) }); // We validate this at startup let notify_url = compute_hook_url.as_ref().unwrap(); self.do_notify(notify_url, &request, cancel).await } else { - self.do_notify_local(&request).await.map_err(|e| { + self.do_notify_local::(&request).await.map_err(|e| { // This path is for testing only, so munge the error into our prod-style error type. tracing::error!("neon_local notification hook failed: {e}"); NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR) @@ -686,7 +887,7 @@ impl ComputeHook { /// Infallible synchronous fire-and-forget version of notify(), that sends its results to /// a channel. Something should consume the channel and arrange to try notifying again /// if something failed. - pub(super) fn notify_background( + pub(super) fn notify_attach_background( self: &Arc, notifications: Vec, result_tx: tokio::sync::mpsc::Sender>, @@ -695,7 +896,7 @@ impl ComputeHook { let mut maybe_sends = Vec::new(); for shard_update in notifications { let tenant_shard_id = shard_update.tenant_shard_id; - let maybe_send_result = self.notify_prepare(shard_update); + let maybe_send_result = self.notify_attach_prepare(shard_update); maybe_sends.push((tenant_shard_id, maybe_send_result)) } @@ -714,10 +915,10 @@ impl ComputeHook { async move { this - .notify_execute(maybe_send_result, tenant_shard_id, &cancel) + .notify_execute(&this.tenants, maybe_send_result, &cancel) .await.map_err(|e| (tenant_shard_id, e)) }.instrument(info_span!( - "notify_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug() + "notify_attach_background", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug() )) }) .buffered(API_CONCURRENCY); @@ -760,14 +961,23 @@ impl ComputeHook { /// ensuring that they eventually call again to ensure that the compute is eventually notified of /// the proper pageserver nodes for a tenant. 
#[tracing::instrument(skip_all, fields(tenant_id=%shard_update.tenant_shard_id.tenant_id, shard_id=%shard_update.tenant_shard_id.shard_slug(), node_id))] - pub(super) async fn notify<'a>( + pub(super) async fn notify_attach<'a>( &self, shard_update: ShardUpdate<'a>, cancel: &CancellationToken, ) -> Result<(), NotifyError> { - let tenant_shard_id = shard_update.tenant_shard_id; - let maybe_send_result = self.notify_prepare(shard_update); - self.notify_execute(maybe_send_result, tenant_shard_id, cancel) + let maybe_send_result = self.notify_attach_prepare(shard_update); + self.notify_execute(&self.tenants, maybe_send_result, cancel) + .await + } + + pub(super) async fn notify_safekeepers( + &self, + safekeepers_update: SafekeepersUpdate, + cancel: &CancellationToken, + ) -> Result<(), NotifyError> { + let maybe_send_result = self.notify_safekeepers_prepare(safekeepers_update); + self.notify_execute(&self.timelines, maybe_send_result, cancel) .await } @@ -783,8 +993,8 @@ impl ComputeHook { ) { use std::collections::hash_map::Entry; - let mut state_locked = self.state.lock().unwrap(); - match state_locked.entry(tenant_shard_id.tenant_id) { + let mut tenants_locked = self.tenants.lock().unwrap(); + match tenants_locked.entry(tenant_shard_id.tenant_id) { Entry::Vacant(_) => { // This is a valid but niche case, where the tenant was previously attached // as a Secondary location and then detached, so has no previously notified diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index a7e86b5224..66c44b5674 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -22,7 +22,7 @@ use pageserver_api::controller_api::{ MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest, - TimelineImportRequest, + TimelineImportRequest, TimelineSafekeeperMigrateRequest, }; use pageserver_api::models::{ DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest, @@ -34,6 +34,7 @@ use pageserver_api::upcall_api::{ PutTimelineImportStatusRequest, ReAttachRequest, TimelineImportStatusRequest, ValidateRequest, }; use pageserver_client::{BlockUnblock, mgmt_api}; + use routerify::Middleware; use tokio_util::sync::CancellationToken; use tracing::warn; @@ -635,6 +636,32 @@ async fn handle_tenant_timeline_download_heatmap_layers( json_response(StatusCode::OK, ()) } +async fn handle_tenant_timeline_safekeeper_migrate( + service: Arc, + req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + maybe_rate_limit(&req, tenant_id).await; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let migrate_req = json_request::(&mut req).await?; + + service + .tenant_timeline_safekeeper_migrate(tenant_id, timeline_id, migrate_req) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn handle_tenant_timeline_lsn_lease( service: Arc, req: Request, @@ -2458,6 +2485,16 @@ pub fn make_router( ) }, ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate", + |r| { + tenant_service_handler( + r, + 
handle_tenant_timeline_safekeeper_migrate, + RequestName("v1_tenant_timeline_safekeeper_migrate"), + ) + }, + ) // LSN lease passthrough to all shards .post( "/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease", diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index d1c2858d6f..2a851dc25b 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -6,9 +6,7 @@ use std::time::Duration; use anyhow::{Context, anyhow}; use camino::Utf8PathBuf; -#[cfg(feature = "testing")] -use clap::ArgAction; -use clap::Parser; +use clap::{ArgAction, Parser}; use futures::future::OptionFuture; use http_utils::tls_certs::ReloadingCertificateResolver; use hyper0::Uri; @@ -216,14 +214,13 @@ struct Cli { /// Number of safekeepers to choose for a timeline when creating it. /// Safekeepers will be choosen from different availability zones. /// This option exists primarily for testing purposes. - #[arg(long, default_value = "3", value_parser = clap::value_parser!(i64).range(1..))] - timeline_safekeeper_count: i64, + #[arg(long, default_value = "3", value_parser = clap::builder::RangedU64ValueParser::::new().range(1..))] + timeline_safekeeper_count: usize, /// When set, actively checks and initiates heatmap downloads/uploads during reconciliation. /// This speed up migrations by avoiding the default wait for the heatmap download interval. /// Primarily useful for testing to reduce test execution time. - #[cfg(feature = "testing")] - #[arg(long, default_value = "true", action=ArgAction::Set)] + #[arg(long, default_value = "false", action=ArgAction::Set)] kick_secondary_downloads: bool, } @@ -472,7 +469,6 @@ async fn async_main() -> anyhow::Result<()> { use_local_compute_notifications: args.use_local_compute_notifications, timeline_safekeeper_count: args.timeline_safekeeper_count, posthog_config: posthog_config.clone(), - #[cfg(feature = "testing")] kick_secondary_downloads: args.kick_secondary_downloads, }; diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 07713c3fbc..f7f77cdd23 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -333,6 +333,7 @@ pub(crate) enum DatabaseErrorLabel { ConnectionPool, Logical, Migration, + Cas, } impl DatabaseError { @@ -343,6 +344,7 @@ impl DatabaseError { Self::ConnectionPool(_) => DatabaseErrorLabel::ConnectionPool, Self::Logical(_) => DatabaseErrorLabel::Logical, Self::Migration(_) => DatabaseErrorLabel::Migration, + Self::Cas(_) => DatabaseErrorLabel::Cas, } } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 2948e9019f..56f4d03111 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -29,6 +29,7 @@ use pageserver_api::shard::{ use rustls::client::WebPkiServerVerifier; use rustls::client::danger::{ServerCertVerified, ServerCertVerifier}; use rustls::crypto::ring; +use safekeeper_api::membership::SafekeeperGeneration; use scoped_futures::ScopedBoxFuture; use serde::{Deserialize, Serialize}; use utils::generation::Generation; @@ -94,6 +95,8 @@ pub(crate) enum DatabaseError { Logical(String), #[error("Migration error: {0}")] Migration(String), + #[error("CAS error: {0}")] + Cas(String), } #[derive(measured::FixedCardinalityLabel, Copy, Clone)] @@ -126,6 +129,7 @@ pub(crate) enum DatabaseOperation { UpdateLeader, SetPreferredAzs, InsertTimeline, + UpdateTimelineMembership, GetTimeline, InsertTimelineReconcile, RemoveTimelineReconcile, @@ -1410,6 +1414,56 @@ impl 
Persistence { .await } + /// Update timeline membership configuration in the database. + /// Perform a compare-and-swap (CAS) operation on the timeline's generation. + /// The `new_generation` must be the next (+1) generation after the one in the database. + pub(crate) async fn update_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + new_generation: SafekeeperGeneration, + sk_set: &[NodeId], + new_sk_set: Option<&[NodeId]>, + ) -> DatabaseResult<()> { + use crate::schema::timelines::dsl; + + let prev_generation = new_generation.previous().unwrap(); + + let tenant_id = &tenant_id; + let timeline_id = &timeline_id; + self.with_measured_conn(DatabaseOperation::UpdateTimelineMembership, move |conn| { + Box::pin(async move { + let updated = diesel::update(dsl::timelines) + .filter(dsl::tenant_id.eq(&tenant_id.to_string())) + .filter(dsl::timeline_id.eq(&timeline_id.to_string())) + .filter(dsl::generation.eq(prev_generation.into_inner() as i32)) + .set(( + dsl::generation.eq(new_generation.into_inner() as i32), + dsl::sk_set.eq(sk_set.iter().map(|id| id.0 as i64).collect::>()), + dsl::new_sk_set.eq(new_sk_set + .map(|set| set.iter().map(|id| id.0 as i64).collect::>())), + )) + .execute(conn) + .await?; + + match updated { + 0 => { + // TODO(diko): It makes sense to select the current generation + // and include it in the error message for better debuggability. + Err(DatabaseError::Cas( + "Failed to update membership configuration".to_string(), + )) + } + 1 => Ok(()), + _ => Err(DatabaseError::Logical(format!( + "unexpected number of rows ({updated})" + ))), + } + }) + }) + .await + } + /// Load timeline from db. Returns `None` if not present. pub(crate) async fn get_timeline( &self, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 92844c9c7b..a2fba0fa56 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -65,7 +65,7 @@ pub(super) struct Reconciler { pub(crate) compute_hook: Arc, /// To avoid stalling if the cloud control plane is unavailable, we may proceed - /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed + /// past failures in [`ComputeHook::notify_attach`], but we _must_ remember that we failed /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry. 
pub(crate) compute_notify_failure: bool, @@ -1023,7 +1023,7 @@ impl Reconciler { if let Some(node) = &self.intent.attached { let result = self .compute_hook - .notify( + .notify_attach( compute_hook::ShardUpdate { tenant_shard_id: self.tenant_shard_id, node_id: node.get_id(), diff --git a/storage_controller/src/safekeeper.rs b/storage_controller/src/safekeeper.rs index 5a13ef750e..91154f4fa3 100644 --- a/storage_controller/src/safekeeper.rs +++ b/storage_controller/src/safekeeper.rs @@ -2,6 +2,7 @@ use std::time::Duration; use pageserver_api::controller_api::{SafekeeperDescribeResponse, SkSchedulingPolicy}; use reqwest::StatusCode; +use safekeeper_api::membership::SafekeeperId; use safekeeper_client::mgmt_api; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -92,6 +93,13 @@ impl Safekeeper { pub(crate) fn has_https_port(&self) -> bool { self.listen_https_port.is_some() } + pub(crate) fn get_safekeeper_id(&self) -> SafekeeperId { + SafekeeperId { + id: self.id, + host: self.skp.host.clone(), + pg_port: self.skp.port as u16, + } + } /// Perform an operation (which is given a [`SafekeeperClient`]) with retries #[allow(clippy::too_many_arguments)] pub(crate) async fn with_client_retries( diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index bcf223c731..47a785e7d3 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -56,6 +56,10 @@ impl SafekeeperClient { } } + pub(crate) fn node_id_label(&self) -> &str { + &self.node_id_label + } + pub(crate) async fn create_timeline( &self, req: &TimelineCreateRequest, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index b4dfd01249..e4c494db8f 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -161,6 +161,7 @@ enum TenantOperations { DropDetached, DownloadHeatmapLayers, TimelineLsnLease, + TimelineSafekeeperMigrate, } #[derive(Clone, strum_macros::Display)] @@ -471,12 +472,12 @@ pub struct Config { /// Number of safekeepers to choose for a timeline when creating it. /// Safekeepers will be choosen from different availability zones. - pub timeline_safekeeper_count: i64, + pub timeline_safekeeper_count: usize, /// PostHog integration config pub posthog_config: Option, - #[cfg(feature = "testing")] + /// When set, actively checks and initiates heatmap downloads/uploads. pub kick_secondary_downloads: bool, } @@ -491,6 +492,7 @@ impl From for ApiError { DatabaseError::Logical(reason) | DatabaseError::Migration(reason) => { ApiError::InternalServerError(anyhow::anyhow!(reason)) } + DatabaseError::Cas(reason) => ApiError::Conflict(reason), } } } @@ -876,18 +878,18 @@ impl Service { // Emit compute hook notifications for all tenants which are already stably attached. Other tenants // will emit compute hook notifications when they reconcile. // - // Ordering: our calls to notify_background synchronously establish a relative order for these notifications vs. any later + // Ordering: our calls to notify_attach_background synchronously establish a relative order for these notifications vs. any later // calls into the ComputeHook for the same tenant: we can leave these to run to completion in the background and any later // calls will be correctly ordered wrt these. 
// - // Concurrency: we call notify_background for all tenants, which will create O(N) tokio tasks, but almost all of them + // Concurrency: we call notify_attach_background for all tenants, which will create O(N) tokio tasks, but almost all of them // will just wait on the ComputeHook::API_CONCURRENCY semaphore immediately, so very cheap until they get that semaphore // unit and start doing I/O. tracing::info!( "Sending {} compute notifications", compute_notifications.len() ); - self.compute_hook.notify_background( + self.compute_hook.notify_attach_background( compute_notifications, bg_compute_notify_result_tx.clone(), &self.cancel, @@ -2582,7 +2584,7 @@ impl Service { .do_initial_shard_scheduling( tenant_shard_id, initial_generation, - &create_req.shard_parameters, + create_req.shard_parameters, create_req.config.clone(), placement_policy.clone(), preferred_az_id.as_ref(), @@ -2639,7 +2641,7 @@ impl Service { &self, tenant_shard_id: TenantShardId, initial_generation: Option, - shard_params: &ShardParameters, + shard_params: ShardParameters, config: TenantConfig, placement_policy: PlacementPolicy, preferred_az_id: Option<&AvailabilityZone>, @@ -6279,7 +6281,7 @@ impl Service { for (child_id, child_ps, stripe_size) in child_locations { if let Err(e) = self .compute_hook - .notify( + .notify_attach( compute_hook::ShardUpdate { tenant_shard_id: child_id, node_id: child_ps, @@ -8364,7 +8366,6 @@ impl Service { "Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}" ); - #[cfg(feature = "testing")] if progress.heatmap_mtime.is_none() { // No heatmap might mean the attached location has never uploaded one, or that // the secondary download hasn't happened yet. This is relatively unusual in the field, @@ -8389,7 +8390,6 @@ impl Service { /// happens on multi-minute timescales in the field, which is fine because optimisation is meant /// to be a lazy background thing. However, when testing, it is not practical to wait around, so /// we have this helper to move things along faster. 
- #[cfg(feature = "testing")] async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { if !self.config.kick_secondary_downloads { // No-op if kick_secondary_downloads functionaliuty is not configured diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index a3c5082be6..b67a679fad 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -145,7 +145,7 @@ pub(crate) async fn load_schedule_requests( } let Some(sk) = safekeepers.get(&other_node_id) else { tracing::warn!( - "couldnt find safekeeper with pending op id {other_node_id}, not pulling from it" + "couldn't find safekeeper with pending op id {other_node_id}, not pulling from it" ); return None; }; diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index fec81fb661..cf48b007b2 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -1,25 +1,34 @@ -use std::cmp::max; use std::collections::HashSet; use std::str::FromStr; use std::sync::Arc; use std::time::Duration; use super::safekeeper_reconciler::ScheduleRequest; +use crate::compute_hook; use crate::heartbeater::SafekeeperState; +use crate::id_lock_map::trace_shared_lock; use crate::metrics; use crate::persistence::{ DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, }; use crate::safekeeper::Safekeeper; +use crate::safekeeper_client::SafekeeperClient; +use crate::service::TenantOperations; use crate::timeline_import::TimelineImportFinalizeError; use anyhow::Context; use http_utils::error::ApiError; use pageserver_api::controller_api::{ SafekeeperDescribeResponse, SkSchedulingPolicy, TimelineImportRequest, + TimelineSafekeeperMigrateRequest, }; use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo}; use safekeeper_api::PgVersionId; -use safekeeper_api::membership::{MemberSet, SafekeeperGeneration, SafekeeperId}; +use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration}; +use safekeeper_api::models::{ + PullTimelineRequest, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse, +}; +use safekeeper_api::{INITIAL_TERM, Term}; +use safekeeper_client::mgmt_api; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::id::{NodeId, TenantId, TimelineId}; @@ -36,6 +45,33 @@ pub struct TimelineLocateResponse { } impl Service { + fn make_member_set(safekeepers: &[Safekeeper]) -> Result { + let members = safekeepers + .iter() + .map(|sk| sk.get_safekeeper_id()) + .collect::>(); + + MemberSet::new(members).map_err(ApiError::InternalServerError) + } + + fn get_safekeepers(&self, ids: &[i64]) -> Result, ApiError> { + let safekeepers = { + let locked = self.inner.read().unwrap(); + locked.safekeepers.clone() + }; + + ids.iter() + .map(|&id| { + let node_id = NodeId(id as u64); + safekeepers.get(&node_id).cloned().ok_or_else(|| { + ApiError::InternalServerError(anyhow::anyhow!( + "safekeeper {node_id} is not registered" + )) + }) + }) + .collect::, _>>() + } + /// Timeline creation on safekeepers /// /// Returns `Ok(left)` if the timeline has been created on a quorum of safekeepers, @@ -48,35 +84,9 @@ impl Service { pg_version: PgVersionId, timeline_persistence: &TimelinePersistence, ) -> Result, ApiError> { - // If quorum is reached, return if we are outside of a specified timeout - let 
jwt = self - .config - .safekeeper_jwt_token - .clone() - .map(SecretString::from); - let mut joinset = JoinSet::new(); + let safekeepers = self.get_safekeepers(&timeline_persistence.sk_set)?; - // Prepare membership::Configuration from choosen safekeepers. - let safekeepers = { - let locked = self.inner.read().unwrap(); - locked.safekeepers.clone() - }; - - let mut members = Vec::new(); - for sk_id in timeline_persistence.sk_set.iter() { - let sk_id = NodeId(*sk_id as u64); - let Some(safekeeper) = safekeepers.get(&sk_id) else { - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "couldn't find entry for safekeeper with id {sk_id}" - )))?; - }; - members.push(SafekeeperId { - id: sk_id, - host: safekeeper.skp.host.clone(), - pg_port: safekeeper.skp.port as u16, - }); - } - let mset = MemberSet::new(members).map_err(ApiError::InternalServerError)?; + let mset = Self::make_member_set(&safekeepers)?; let mconf = safekeeper_api::membership::Configuration::new(mset); let req = safekeeper_api::models::TimelineCreateRequest { @@ -89,79 +99,150 @@ impl Service { timeline_id, wal_seg_size: None, }; + const SK_CREATE_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); - for sk in timeline_persistence.sk_set.iter() { - let sk_id = NodeId(*sk as u64); - let safekeepers = safekeepers.clone(); + + let results = self + .tenant_timeline_safekeeper_op_quorum( + &safekeepers, + move |client| { + let req = req.clone(); + async move { client.create_timeline(&req).await } + }, + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, + ) + .await?; + + Ok(results + .into_iter() + .enumerate() + .filter_map(|(idx, res)| { + if res.is_ok() { + None // Success, don't return this safekeeper + } else { + Some(safekeepers[idx].get_id()) // Failure, return this safekeeper + } + }) + .collect::>()) + } + + /// Perform an operation on a list of safekeepers in parallel with retries. + /// + /// Return the results of the operation on each safekeeper in the input order. + async fn tenant_timeline_safekeeper_op( + &self, + safekeepers: &[Safekeeper], + op: O, + timeout: Duration, + ) -> Result>, ApiError> + where + O: FnMut(SafekeeperClient) -> F + Send + 'static, + O: Clone, + F: std::future::Future> + Send + 'static, + T: Sync + Send + 'static, + { + let jwt = self + .config + .safekeeper_jwt_token + .clone() + .map(SecretString::from); + let mut joinset = JoinSet::new(); + + for (idx, sk) in safekeepers.iter().enumerate() { + let sk = sk.clone(); let http_client = self.http_client.clone(); let jwt = jwt.clone(); - let req = req.clone(); + let op = op.clone(); joinset.spawn(async move { - // Unwrap is fine as we already would have returned error above - let sk_p = safekeepers.get(&sk_id).unwrap(); - let res = sk_p + let res = sk .with_client_retries( - |client| { - let req = req.clone(); - async move { client.create_timeline(&req).await } - }, + op, &http_client, &jwt, 3, 3, - SK_CREATE_TIMELINE_RECONCILE_TIMEOUT, + // TODO(diko): This is a wrong timeout. + // It should be scaled to the retry count. + timeout, &CancellationToken::new(), ) .await; - (sk_id, sk_p.skp.host.clone(), res) + (idx, res) }); } + + // Initialize results with timeout errors in case we never get a response. + let mut results: Vec> = safekeepers + .iter() + .map(|_| { + Err(mgmt_api::Error::Timeout( + "safekeeper operation timed out".to_string(), + )) + }) + .collect(); + // After we have built the joinset, we now wait for the tasks to complete, // but with a specified timeout to make sure we return swiftly, either with // a failure or success. 
- let reconcile_deadline = tokio::time::Instant::now() + SK_CREATE_TIMELINE_RECONCILE_TIMEOUT; + let reconcile_deadline = tokio::time::Instant::now() + timeout; // Wait until all tasks finish or timeout is hit, whichever occurs // first. - let mut reconcile_results = Vec::new(); + let mut result_count = 0; loop { if let Ok(res) = tokio::time::timeout_at(reconcile_deadline, joinset.join_next()).await { let Some(res) = res else { break }; match res { - Ok(res) => { + Ok((idx, res)) => { + let sk = &safekeepers[idx]; tracing::info!( "response from safekeeper id:{} at {}: {:?}", - res.0, - res.1, - res.2 + sk.get_id(), + sk.skp.host, + // Only print errors, as there is no Debug trait for T. + res.as_ref().map(|_| ()), ); - reconcile_results.push(res); + results[idx] = res; + result_count += 1; } Err(join_err) => { tracing::info!("join_err for task in joinset: {join_err}"); } } } else { - tracing::info!( - "timeout for creation call after {} responses", - reconcile_results.len() - ); + tracing::info!("timeout for operation call after {result_count} responses",); break; } } - // Now check now if quorum was reached in reconcile_results. - let total_result_count = reconcile_results.len(); - let remaining = reconcile_results - .into_iter() - .filter_map(|res| res.2.is_err().then_some(res.0)) - .collect::>(); - tracing::info!( - "Got {} non-successful responses from initial creation request of total {total_result_count} responses", - remaining.len() - ); - let target_sk_count = timeline_persistence.sk_set.len(); + Ok(results) + } + + /// Perform an operation on a list of safekeepers in parallel with retries, + /// and validates that we reach a quorum of successful responses. + /// + /// Return the results of the operation on each safekeeper in the input order. + /// It's guaranteed that at least a quorum of the responses are successful. + async fn tenant_timeline_safekeeper_op_quorum( + &self, + safekeepers: &[Safekeeper], + op: O, + timeout: Duration, + ) -> Result>, ApiError> + where + O: FnMut(SafekeeperClient) -> F, + O: Clone + Send + 'static, + F: std::future::Future> + Send + 'static, + T: Sync + Send + 'static, + { + let results = self + .tenant_timeline_safekeeper_op(safekeepers, op, timeout) + .await?; + + // Now check if quorum was reached in results. + + let target_sk_count = safekeepers.len(); let quorum_size = match target_sk_count { 0 => { return Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -180,7 +261,7 @@ impl Service { // in order to schedule work to them tracing::warn!( "couldn't find at least 3 safekeepers for timeline, found: {:?}", - timeline_persistence.sk_set + target_sk_count ); return Err(ApiError::InternalServerError(anyhow::anyhow!( "couldn't find at least 3 safekeepers to put timeline to" @@ -189,7 +270,7 @@ impl Service { } _ => target_sk_count / 2 + 1, }; - let success_count = target_sk_count - remaining.len(); + let success_count = results.iter().filter(|res| res.is_ok()).count(); if success_count < quorum_size { // Failure return Err(ApiError::InternalServerError(anyhow::anyhow!( @@ -197,7 +278,7 @@ impl Service { ))); } - Ok(remaining) + Ok(results) } /// Create timeline in controller database and on safekeepers. @@ -654,13 +735,7 @@ impl Service { ) }); // Number of safekeepers in different AZs we are looking for - let mut wanted_count = self.config.timeline_safekeeper_count as usize; - // TODO(diko): remove this when `timeline_safekeeper_count` option is in the release - // branch and is specified in tests/neon_local config. 
- if cfg!(feature = "testing") && all_safekeepers.len() < wanted_count { - // In testing mode, we can have less safekeepers than the config says - wanted_count = max(all_safekeepers.len(), 1); - } + let wanted_count = self.config.timeline_safekeeper_count; let mut sks = Vec::new(); let mut azs = HashSet::new(); @@ -804,4 +879,486 @@ impl Service { } Ok(()) } + + /// Call `switch_timeline_membership` on all safekeepers with retries + /// till the quorum of successful responses is reached. + /// + /// If min_position is not None, validates that majority of safekeepers + /// reached at least min_position. + /// + /// Return responses from safekeepers in the input order. + async fn tenant_timeline_set_membership_quorum( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + safekeepers: &[Safekeeper], + config: &membership::Configuration, + min_position: Option<(Term, Lsn)>, + ) -> Result>, ApiError> { + let req = TimelineMembershipSwitchRequest { + mconf: config.clone(), + }; + + const SK_SET_MEM_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + + let results = self + .tenant_timeline_safekeeper_op_quorum( + safekeepers, + move |client| { + let req = req.clone(); + async move { + let mut res = client + .switch_timeline_membership(tenant_id, timeline_id, &req) + .await; + + // If min_position is not reached, map the response to an error, + // so it isn't counted toward the quorum. + if let Some(min_position) = min_position { + if let Ok(ok_res) = &res { + if (ok_res.term, ok_res.flush_lsn) < min_position { + // Use Error::Timeout to make this error retriable. + res = Err(mgmt_api::Error::Timeout( + format!( + "safekeeper {} returned position {:?} which is less than minimum required position {:?}", + client.node_id_label(), + (ok_res.term, ok_res.flush_lsn), + min_position + ) + )); + } + } + } + + res + } + }, + SK_SET_MEM_TIMELINE_RECONCILE_TIMEOUT, + ) + .await?; + + for res in results.iter().flatten() { + if res.current_conf.generation > config.generation { + // Antoher switch_membership raced us. + return Err(ApiError::Conflict(format!( + "received configuration with generation {} from safekeeper, but expected {}", + res.current_conf.generation, config.generation + ))); + } else if res.current_conf.generation < config.generation { + // Note: should never happen. + // If we get a response, it should be at least the sent generation. + tracing::error!( + "received configuration with generation {} from safekeeper, but expected {}", + res.current_conf.generation, + config.generation + ); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "received configuration with generation {} from safekeeper, but expected {}", + res.current_conf.generation, + config.generation + ))); + } + } + + Ok(results) + } + + /// Pull timeline to to_safekeepers from from_safekeepers with retries. + /// + /// Returns Ok(()) only if all the pull_timeline requests were successful. + async fn tenant_timeline_pull_from_peers( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + to_safekeepers: &[Safekeeper], + from_safekeepers: &[Safekeeper], + ) -> Result<(), ApiError> { + let http_hosts = from_safekeepers + .iter() + .map(|sk| sk.base_url()) + .collect::>(); + + tracing::info!( + "pulling timeline to {:?} from {:?}", + to_safekeepers + .iter() + .map(|sk| sk.get_id()) + .collect::>(), + from_safekeepers + .iter() + .map(|sk| sk.get_id()) + .collect::>() + ); + + // TODO(diko): need to pass mconf/generation with the request + // to properly handle tombstones. 
Ignore tombstones for now. + // Worst case: we leave a timeline on a safekeeper which is not in the current set. + let req = PullTimelineRequest { + tenant_id, + timeline_id, + http_hosts, + ignore_tombstone: Some(true), + }; + + const SK_PULL_TIMELINE_RECONCILE_TIMEOUT: Duration = Duration::from_secs(30); + + let responses = self + .tenant_timeline_safekeeper_op( + to_safekeepers, + move |client| { + let req = req.clone(); + async move { client.pull_timeline(&req).await } + }, + SK_PULL_TIMELINE_RECONCILE_TIMEOUT, + ) + .await?; + + if let Some((idx, err)) = responses + .iter() + .enumerate() + .find_map(|(idx, res)| Some((idx, res.as_ref().err()?))) + { + let sk_id = to_safekeepers[idx].get_id(); + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "pull_timeline to {sk_id} failed: {err}", + ))); + } + + Ok(()) + } + + /// Exclude a timeline from safekeepers in parallel with retries. + /// If an exclude request is unsuccessful, it will be added to + /// the reconciler, and after that the function will succeed. + async fn tenant_timeline_safekeeper_exclude( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + safekeepers: &[Safekeeper], + config: &membership::Configuration, + ) -> Result<(), ApiError> { + let req = TimelineMembershipSwitchRequest { + mconf: config.clone(), + }; + + const SK_EXCLUDE_TIMELINE_TIMEOUT: Duration = Duration::from_secs(30); + + let results = self + .tenant_timeline_safekeeper_op( + safekeepers, + move |client| { + let req = req.clone(); + async move { client.exclude_timeline(tenant_id, timeline_id, &req).await } + }, + SK_EXCLUDE_TIMELINE_TIMEOUT, + ) + .await?; + + let mut reconcile_requests = Vec::new(); + + for (idx, res) in results.iter().enumerate() { + if res.is_err() { + let sk_id = safekeepers[idx].skp.id; + let pending_op = TimelinePendingOpPersistence { + tenant_id: tenant_id.to_string(), + timeline_id: timeline_id.to_string(), + generation: config.generation.into_inner() as i32, + op_kind: SafekeeperTimelineOpKind::Exclude, + sk_id, + }; + tracing::info!("writing pending exclude op for sk id {sk_id}"); + self.persistence.insert_pending_op(pending_op).await?; + + let req = ScheduleRequest { + safekeeper: Box::new(safekeepers[idx].clone()), + host_list: Vec::new(), + tenant_id, + timeline_id: Some(timeline_id), + generation: config.generation.into_inner(), + kind: SafekeeperTimelineOpKind::Exclude, + }; + reconcile_requests.push(req); + } + } + + if !reconcile_requests.is_empty() { + let locked = self.inner.read().unwrap(); + for req in reconcile_requests { + locked.safekeeper_reconcilers.schedule_request(req); + } + } + + Ok(()) + } + + /// Migrate timeline safekeeper set to a new set. + /// + /// This function implements an algorithm from RFC-035. + /// + pub(crate) async fn tenant_timeline_safekeeper_migrate( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + req: TimelineSafekeeperMigrateRequest, + ) -> Result<(), ApiError> { + let all_safekeepers = self.inner.read().unwrap().safekeepers.clone(); + + let new_sk_set = req.new_sk_set; + + for sk_id in new_sk_set.iter() { + if !all_safekeepers.contains_key(sk_id) { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "safekeeper {sk_id} does not exist" + ))); + } + } + + // TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks. + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineSafekeeperMigrate, + ) + .await; + + // 1. 
Fetch current timeline configuration from the configuration storage.
+
+ let timeline = self
+ .persistence
+ .get_timeline(tenant_id, timeline_id)
+ .await?;
+
+ let Some(timeline) = timeline else {
+ return Err(ApiError::NotFound(
+ anyhow::anyhow!(
+ "timeline {tenant_id}/{timeline_id} doesn't exist in timelines table"
+ )
+ .into(),
+ ));
+ };
+
+ let cur_sk_set = timeline
+ .sk_set
+ .iter()
+ .map(|&id| NodeId(id as u64))
+ .collect::<Vec<_>>();
+
+ tracing::info!(
+ ?cur_sk_set,
+ ?new_sk_set,
+ "Migrating timeline to new safekeeper set",
+ );
+
+ let mut generation = SafekeeperGeneration::new(timeline.generation as u32);
+
+ if let Some(ref persistent_new_sk_set) = timeline.new_sk_set {
+ // 2. If it is already a joint one and new_set is different from desired_set, refuse to change.
+ if persistent_new_sk_set
+ .iter()
+ .map(|&id| NodeId(id as u64))
+ .ne(new_sk_set.iter().cloned())
+ {
+ tracing::info!(
+ ?persistent_new_sk_set,
+ ?new_sk_set,
+ "different new safekeeper set is already set in the database",
+ );
+ return Err(ApiError::Conflict(format!(
+ "the timeline is already migrating to a different safekeeper set: {persistent_new_sk_set:?}"
+ )));
+ }
+ // If it is the same new_sk_set, we can continue the migration (retry).
+ } else {
+ // 3. No active migration yet.
+ // Increment the current generation and put desired_set into new_sk_set.
+ generation = generation.next();
+
+ self.persistence
+ .update_timeline_membership(
+ tenant_id,
+ timeline_id,
+ generation,
+ &cur_sk_set,
+ Some(&new_sk_set),
+ )
+ .await?;
+ }
+
+ let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?;
+ let cur_sk_member_set = Self::make_member_set(&cur_safekeepers)?;
+
+ let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
+ let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?;
+ let new_sk_member_set = Self::make_member_set(&new_safekeepers)?;
+
+ let joint_config = membership::Configuration {
+ generation,
+ members: cur_sk_member_set,
+ new_members: Some(new_sk_member_set.clone()),
+ };
+
+ // 4. Call PUT configuration on safekeepers from the current set,
+ // delivering them joint_conf.
+
+ // Notify cplane/compute about the membership change BEFORE changing the membership on safekeepers.
+ // This way the compute will know about new safekeepers from joint_config before we require to
+ // collect a quorum from them.
+ self.cplane_notify_safekeepers(tenant_id, timeline_id, &joint_config)
+ .await?;
+
+ let results = self
+ .tenant_timeline_set_membership_quorum(
+ tenant_id,
+ timeline_id,
+ &cur_safekeepers,
+ &joint_config,
+ None, // no min position
+ )
+ .await?;
+
+ let mut sync_position = (INITIAL_TERM, Lsn::INVALID);
+ for res in results.into_iter().flatten() {
+ let sk_position = (res.term, res.flush_lsn);
+ if sync_position < sk_position {
+ sync_position = sk_position;
+ }
+ }
+
+ tracing::info!(
+ %generation,
+ ?sync_position,
+ "safekeepers set membership updated",
+ );
+
+ // 5. Initialize timeline on safekeeper(s) from new_sk_set where it doesn't exist yet
+ // by doing pull_timeline from the majority of the current set.
+
+ // Filter out safekeepers which are already in the current set.
+ let from_ids: HashSet<NodeId> = cur_safekeepers.iter().map(|sk| sk.get_id()).collect();
+ let pull_to_safekeepers = new_safekeepers
+ .iter()
+ .filter(|sk| !from_ids.contains(&sk.get_id()))
+ .cloned()
+ .collect::<Vec<_>>();
+
+ self.tenant_timeline_pull_from_peers(
+ tenant_id,
+ timeline_id,
+ &pull_to_safekeepers,
+ &cur_safekeepers,
+ )
+ .await?;
+
+ // 6. Call POST bump_term(sync_term) on safekeepers from the new set. Success on majority is enough.
+
+ // TODO(diko): do we need to bump timeline term?
+
+ // 7. Repeatedly call PUT configuration on safekeepers from the new set,
+ // delivering them joint_conf and collecting their positions.
+
+ tracing::info!(?sync_position, "waiting for safekeepers to sync position");
+
+ self.tenant_timeline_set_membership_quorum(
+ tenant_id,
+ timeline_id,
+ &new_safekeepers,
+ &joint_config,
+ Some(sync_position),
+ )
+ .await?;
+
+ // 8. Create new_conf: Configuration incrementing joint_conf generation and
+ // having new safekeeper set as sk_set and None new_sk_set.
+
+ let generation = generation.next();
+
+ let new_conf = membership::Configuration {
+ generation,
+ members: new_sk_member_set,
+ new_members: None,
+ };
+
+ self.persistence
+ .update_timeline_membership(tenant_id, timeline_id, generation, &new_sk_set, None)
+ .await?;
+
+ // TODO(diko): at this point we have already updated the timeline in the database,
+ // but we still need to notify safekeepers and cplane about the new configuration,
+ // and put deletion of the timeline from the old safekeepers into the reconciler.
+ // Ideally it should be done atomically, but for now it is not.
+ // Worst case: the timeline is not deleted from old safekeepers,
+ // and the compute may require both quorums until the migration is retried and completed.
+
+ self.tenant_timeline_set_membership_quorum(
+ tenant_id,
+ timeline_id,
+ &new_safekeepers,
+ &new_conf,
+ None, // no min position
+ )
+ .await?;
+
+ let new_ids: HashSet<NodeId> = new_safekeepers.iter().map(|sk| sk.get_id()).collect();
+ let exclude_safekeepers = cur_safekeepers
+ .into_iter()
+ .filter(|sk| !new_ids.contains(&sk.get_id()))
+ .collect::<Vec<_>>();
+ self.tenant_timeline_safekeeper_exclude(
+ tenant_id,
+ timeline_id,
+ &exclude_safekeepers,
+ &new_conf,
+ )
+ .await?;
+
+ // Notify cplane/compute about the membership change AFTER changing the membership on safekeepers.
+ // This way the compute will stop talking to excluded safekeepers only after we stop requiring to
+ // collect a quorum from them.
+ self.cplane_notify_safekeepers(tenant_id, timeline_id, &new_conf)
+ .await?;
+
+ Ok(())
+ }
+
+ /// Notify cplane about safekeeper membership change.
+ /// The cplane will receive a joint set of safekeepers as a safekeeper list.
+ async fn cplane_notify_safekeepers( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + mconf: &membership::Configuration, + ) -> Result<(), ApiError> { + let mut safekeepers = Vec::new(); + let mut ids: HashSet<_> = HashSet::new(); + + for member in mconf + .members + .m + .iter() + .chain(mconf.new_members.iter().flat_map(|m| m.m.iter())) + { + if ids.insert(member.id) { + safekeepers.push(compute_hook::SafekeeperInfo { + id: member.id, + hostname: Some(member.host.clone()), + }); + } + } + + self.compute_hook + .notify_safekeepers( + compute_hook::SafekeepersUpdate { + tenant_id, + timeline_id, + generation: mconf.generation, + safekeepers, + }, + &self.cancel, + ) + .await + .map_err(|err| { + ApiError::InternalServerError(anyhow::anyhow!( + "failed to notify cplane about safekeeper membership change: {err}" + )) + }) + } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4eb85119ca..2031ec132e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -453,7 +453,7 @@ class NeonEnvBuilder: pageserver_get_vectored_concurrent_io: str | None = None, pageserver_tracing_config: PageserverTracingConfig | None = None, pageserver_import_config: PageserverImportConfig | None = None, - storcon_kick_secondary_downloads: bool | None = None, + storcon_kick_secondary_downloads: bool | None = True, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -724,15 +724,21 @@ class NeonEnvBuilder: shutil.copytree(storcon_db_from_dir, storcon_db_to_dir, ignore=ignore_postgres_log) assert not (storcon_db_to_dir / "postgres.log").exists() + # NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it. - # However, in this new NeonEnv, the pageservers listen on different ports, and the storage controller - # will currently reject re-attach requests from them because the NodeMetadata isn't identical. + # However, in this new NeonEnv, the pageservers and safekeepers listen on different ports, and the storage + # controller will currently reject re-attach requests from them because the NodeMetadata isn't identical. # So, from_repo_dir patches up the the storcon database. patch_script_path = self.repo_dir / "storage_controller_db.startup.sql" assert not patch_script_path.exists() patch_script = "" + for ps in self.env.pageservers: - patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';" + patch_script += f"UPDATE nodes SET listen_http_port={ps.service_port.http}, listen_pg_port={ps.service_port.pg} WHERE node_id = '{ps.id}';\n" + + for sk in self.env.safekeepers: + patch_script += f"UPDATE safekeepers SET http_port={sk.port.http}, port={sk.port.pg} WHERE id = '{sk.id}';\n" + patch_script_path.write_text(patch_script) # Update the config with info about tenants and timelines @@ -1215,6 +1221,13 @@ class NeonEnv: storage_controller_config = storage_controller_config or {} storage_controller_config["use_https_safekeeper_api"] = True + # TODO(diko): uncomment when timeline_safekeeper_count option is in the release branch, + # so the compat tests will not fail bacause of it presence. 
+ # if config.num_safekeepers < 3: + # storage_controller_config = storage_controller_config or {} + # if "timeline_safekeeper_count" not in storage_controller_config: + # storage_controller_config["timeline_safekeeper_count"] = config.num_safekeepers + if storage_controller_config is not None: cfg["storage_controller"] = storage_controller_config @@ -2226,6 +2239,21 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() log.info(f"timeline_create success: {response.json()}") + def migrate_safekeepers( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + new_sk_set: list[int], + ): + response = self.request( + "POST", + f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate", + json={"new_sk_set": new_sk_set}, + headers=self.headers(TokenScope.PAGE_SERVER_API), + ) + response.raise_for_status() + log.info(f"migrate_safekeepers success: {response.json()}") + def locate(self, tenant_id: TenantId) -> list[dict[str, Any]]: """ :return: list of {"shard_id": "", "node_id": int, "listen_pg_addr": str, "listen_pg_port": int, "listen_http_addr": str, "listen_http_port": int} diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index d9037f2d08..d60fc34fa3 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -695,6 +695,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, gc_horizon: int | None, + ignore_lease_deadline: bool | None = None, ) -> dict[str, Any]: """ Unlike most handlers, this will wait for the layers to be actually @@ -707,7 +708,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc", - json={"gc_horizon": gc_horizon}, + json={"gc_horizon": gc_horizon, "ignore_lease_deadline": ignore_lease_deadline}, ) log.info(f"Got GC request response code: {res.status_code}") self.verbose_error(res) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 16ab2bb359..a4d2bf8d9b 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -76,6 +76,7 @@ if TYPE_CHECKING: # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} # export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install +# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} # # # Build previous version of binaries and store them somewhere: # rm -rf pg_install target @@ -102,6 +103,7 @@ if TYPE_CHECKING: # export CHECK_ONDISK_DATA_COMPATIBILITY=true # export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE} # export COMPATIBILITY_POSTGRES_DISTRIB_DIR=neon_previous/pg_install +# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION} # export NEON_BIN=target/${BUILD_TYPE} # export POSTGRES_DISTRIB_DIR=pg_install # diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index ee934a900d..98877ad7f9 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -201,11 +201,13 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): for shard, ps in tenant_get_shards(env, env.initial_tenant): client = ps.http_client() layers_guarded_before_gc = get_layers_protected_by_lease( 
- client, shard, env.initial_timeline, lease_lsn=lsn
+ client, shard, env.initial_timeline, lease_lsn=lease_lsn
+ )
+ gc_result = client.timeline_gc(
+ shard, env.initial_timeline, 0, ignore_lease_deadline=False )
- gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
 layers_guarded_after_gc = get_layers_protected_by_lease(
- client, shard, env.initial_timeline, lease_lsn=lsn
+ client, shard, env.initial_timeline, lease_lsn=lease_lsn
 )
 # Note: cannot assert on `layers_removed` here because it could be layers
diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py
new file mode 100644
index 0000000000..057371175c
--- /dev/null
+++ b/test_runner/regress/test_safekeeper_migration.py
@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
+ """
+ Simple safekeeper migration test.
+ Creates 3 safekeepers. The timeline is configured to use only one safekeeper.
+ 1. Go through all safekeepers, migrating the timeline to each of them.
+ 2. Stop the other safekeepers. Validate that the insert is successful.
+ 3. Start the other safekeepers again and go to the next safekeeper.
+ 4. Validate that the table contains all inserted values.
+ """
+ neon_env_builder.num_safekeepers = 3
+ neon_env_builder.storage_controller_config = {
+ "timelines_onto_safekeepers": True,
+ "timeline_safekeeper_count": 1,
+ }
+ env = neon_env_builder.init_start()
+ # TODO(diko): pageserver spams with various errors during safekeeper migration.
+ # Fix the code so it handles the migration better.
+ env.pageserver.allowed_errors.extend(
+ [
+ ".*Timeline .* was cancelled and cannot be used anymore.*",
+ ".*Timeline .* has been deleted.*",
+ ".*wal receiver task finished with an error.*",
+ ]
+ )
+
+ ep = env.endpoints.create("main", tenant_id=env.initial_tenant)
+
+ mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
+ assert mconf["new_sk_set"] is None
+ assert len(mconf["sk_set"]) == 1
+ assert mconf["generation"] == 1
+
+ ep.start(safekeeper_generation=1, safekeepers=mconf["sk_set"])
+ ep.safe_psql("CREATE EXTENSION neon_test_utils;")
+ ep.safe_psql("CREATE TABLE t(a int)")
+
+ for active_sk in range(1, 4):
+ env.storage_controller.migrate_safekeepers(
+ env.initial_tenant, env.initial_timeline, [active_sk]
+ )
+
+ other_sks = [sk for sk in range(1, 4) if sk != active_sk]
+
+ for sk in other_sks:
+ env.safekeepers[sk - 1].stop()
+
+ ep.safe_psql(f"INSERT INTO t VALUES ({active_sk})")
+
+ for sk in other_sks:
+ env.safekeepers[sk - 1].start()
+
+ ep.clear_buffers()
+
+ assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]
+
+ # 1 initial generation + 2 generation bumps for each of the 3 migrations.
+ expected_gen = 1 + 2 * 3
+
+ mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
+ assert mconf["generation"] == expected_gen
+
+ assert ep.safe_psql("SHOW neon.safekeepers")[0][0].startswith(f"g#{expected_gen}:")
+
+ # Restart and check again to make sure data is persistent.
+ ep.stop()
+ ep.start(safekeeper_generation=1, safekeepers=[3])
+
+ assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]
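# A minimal illustrative sketch, not part of the change set above: it exercises only the
# fixtures introduced in this diff (NeonStorageController.migrate_safekeepers and
# timeline_locate) to show the bookkeeping a completed migration is expected to leave
# behind under the RFC-035 flow implemented in safekeeper_service.rs: no joint
# configuration remains ("new_sk_set" is None), the member set equals the requested one,
# and the generation advances by 2 (one bump for the joint configuration, one for the
# final one). The helper name migrate_and_check and the "+ 2" assumption (no migration
# already in flight when it is called) are assumptions of this sketch, not taken from
# the diff itself.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnv


def migrate_and_check(env: NeonEnv, tenant_id, timeline_id, new_sk_set: list[int]) -> None:
    """Migrate a timeline to new_sk_set and check the resulting membership bookkeeping."""
    before = env.storage_controller.timeline_locate(tenant_id, timeline_id)
    # Assumes no migration is already in progress; a retried migration only bumps once more.
    assert before["new_sk_set"] is None

    env.storage_controller.migrate_safekeepers(tenant_id, timeline_id, new_sk_set)

    after = env.storage_controller.timeline_locate(tenant_id, timeline_id)
    # The joint configuration is dropped once the migration completes.
    assert after["new_sk_set"] is None
    assert sorted(after["sk_set"]) == sorted(new_sk_set)
    # One generation bump for the joint configuration, one more for the final configuration.
    assert after["generation"] == before["generation"] + 2


# Example usage inside a test like the one above:
#   migrate_and_check(env, env.initial_tenant, env.initial_timeline, [2])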