diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000..c8fd1209de --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,6 @@ + +blank_issues_enabled: true +contact_links: + - name: Feature request + url: https://console.neon.tech/app/projects?modal=feedback + about: For feature requests in the Neon product, please submit via the feedback form on `https://console.neon.tech` diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 6c2cee0971..4008cd0d36 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -71,7 +71,7 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }} + name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }} path: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} prefix: latest # The lack of compatibility snapshot (for example, for the new Postgres version) @@ -211,13 +211,13 @@ runs: fi - name: Upload compatibility snapshot - if: github.ref_name == 'release' + # Note, that we use `github.base_ref` which is a target branch for a PR + if: github.event_name == 'pull_request' && github.base_ref == 'release' uses: ./.github/actions/upload with: - name: compatibility-snapshot-${{ inputs.build_type }}-pg${{ inputs.pg_version }}-${{ github.run_id }} + name: compatibility-snapshot-${{ runner.arch }}-${{ inputs.build_type }}-pg${{ inputs.pg_version }} # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test path: /tmp/test_output/compatibility_snapshot_pg${{ inputs.pg_version }}/ - prefix: latest - name: Upload test results if: ${{ !cancelled() }} diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 5e9fff0e6a..e18e6a1201 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -216,8 +216,14 @@ jobs: #nextest does not yet support running doctests ${cov_prefix} cargo test --doc $CARGO_FLAGS $CARGO_FEATURES + # run all non-pageserver tests + ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' + + # run pageserver tests with different settings for io_engine in std-fs tokio-epoll-uring ; do - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES + for io_buffer_alignment in 0 1 512 ; do + NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT=$io_buffer_alignment ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' + done done # Run separate tests for real S3 diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1e7f3598c2..ee5fd1b0c6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -286,6 +286,7 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring + SYNC_AFTER_EACH_TEST: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -1055,43 +1056,88 @@ jobs: generate_release_notes: true, }) + # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory promote-compatibility-data: - needs: [ check-permissions, promote-images, tag, build-and-test-locally ] + needs: [ deploy ] if: github.ref_name == 'release' - runs-on: [ self-hosted, small ] - container: - image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - options: --init + runs-on: ubuntu-22.04 steps: - - name: Promote compatibility snapshot for the release + - name: Fetch GITHUB_RUN_ID and COMMIT_SHA for the last merged release PR + id: fetch-last-release-pr-info + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + branch_name_and_pr_number=$(gh pr list \ + --repo "${GITHUB_REPOSITORY}" \ + --base release \ + --state merged \ + --limit 10 \ + --json mergeCommit,headRefName,number \ + --jq ".[] | select(.mergeCommit.oid==\"${GITHUB_SHA}\") | { branch_name: .headRefName, pr_number: .number }") + branch_name=$(echo "${branch_name_and_pr_number}" | jq -r '.branch_name') + pr_number=$(echo "${branch_name_and_pr_number}" | jq -r '.pr_number') + + run_id=$(gh run list \ + --repo "${GITHUB_REPOSITORY}" \ + --workflow build_and_test.yml \ + --branch "${branch_name}" \ + --json databaseId \ + --limit 1 \ + --jq '.[].databaseId') + + last_commit_sha=$(gh pr view "${pr_number}" \ + --repo "${GITHUB_REPOSITORY}" \ + --json commits \ + --jq '.commits[-1].oid') + + echo "run-id=${run_id}" | tee -a ${GITHUB_OUTPUT} + echo "commit-sha=${last_commit_sha}" | tee -a ${GITHUB_OUTPUT} + + - name: Promote compatibility snapshot and Neon artifact env: BUCKET: neon-github-public-dev - PREFIX: artifacts/latest - COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + AWS_REGION: eu-central-1 + COMMIT_SHA: ${{ steps.fetch-last-release-pr-info.outputs.commit-sha }} + RUN_ID: ${{ steps.fetch-last-release-pr-info.outputs.run-id }} run: | - # Update compatibility snapshot for the release - for pg_version in v14 v15 v16; do - for build_type in debug release; do - OLD_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}-${GITHUB_RUN_ID}.tar.zst - NEW_FILENAME=compatibility-snapshot-${build_type}-pg${pg_version}.tar.zst + old_prefix="artifacts/${COMMIT_SHA}/${RUN_ID}" + new_prefix="artifacts/latest" - time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME} + files_to_promote=() + files_on_s3=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${old_prefix} | jq -r '.Contents[]?.Key' || true) + + for arch in X64 ARM64; do + for build_type in debug release; do + neon_artifact_filename="neon-Linux-${arch}-${build_type}-artifact.tar.zst" + s3_key=$(echo "${files_on_s3}" | grep ${neon_artifact_filename} | sort --version-sort | tail -1 || true) + if [ -z "${s3_key}" ]; then + echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${neon_artifact_filename} nor its version from previous attempts exist" + exit 1 + fi + + files_to_promote+=("s3://${BUCKET}/${s3_key}") + + for pg_version in v14 v15 v16; do + # We run less tests for debug builds, so we don't need to promote them + if [ "${build_type}" == "debug" ] && { [ "${arch}" == "ARM64" ] || [ "${pg_version}" != "v16" ] ; }; then + continue + fi + + compatibility_data_filename="compatibility-snapshot-${arch}-${build_type}-pg${pg_version}.tar.zst" + s3_key=$(echo "${files_on_s3}" | grep ${compatibility_data_filename} | sort --version-sort | tail -1 || true) + if [ -z "${s3_key}" ]; then + echo >&2 "Neither s3://${BUCKET}/${old_prefix}/${compatibility_data_filename} nor its version from previous attempts exist" + exit 1 + fi + + files_to_promote+=("s3://${BUCKET}/${s3_key}") + done done done - # Update Neon artifact for the release (reuse already uploaded artifact) - for build_type in debug release; do - OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID} - FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst - - S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) - if [ -z "${S3_KEY}" ]; then - echo >&2 "Neither s3://${BUCKET}/${OLD_PREFIX}/${FILENAME} nor its version from previous attempts exist" - exit 1 - fi - - time aws s3 cp --only-show-errors s3://${BUCKET}/${S3_KEY} s3://${BUCKET}/${PREFIX}/${FILENAME} + for f in "${files_to_promote[@]}"; do + time aws s3 cp --only-show-errors ${f} s3://${BUCKET}/${new_prefix}/ done pin-build-tools-image: diff --git a/Cargo.lock b/Cargo.lock index 441ca1ff86..91917d5351 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -936,6 +936,12 @@ dependencies = [ "which", ] +[[package]] +name = "bit_field" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc827186963e592360843fb5ba4b973e145841266c1357f7180c43526f2e5b61" + [[package]] name = "bitflags" version = "1.3.2" @@ -1327,7 +1333,6 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "camino", "clap", "comfy-table", @@ -2722,6 +2727,12 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "indoc" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" + [[package]] name = "infer" version = "0.2.3" @@ -2944,17 +2955,6 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" -[[package]] -name = "leaky-bucket" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853" -dependencies = [ - "parking_lot 0.12.1", - "tokio", - "tracing", -] - [[package]] name = "libc" version = "0.2.150" @@ -3683,6 +3683,7 @@ dependencies = [ "async-compression", "async-stream", "async-trait", + "bit_field", "byteorder", "bytes", "camino", @@ -3706,8 +3707,8 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.26", + "indoc", "itertools 0.10.5", - "leaky-bucket", "md5", "metrics", "nix 0.27.1", @@ -3732,6 +3733,7 @@ dependencies = [ "reqwest 0.12.4", "rpds", "scopeguard", + "send-future", "serde", "serde_json", "serde_path_to_error", @@ -3771,6 +3773,7 @@ dependencies = [ "bincode", "byteorder", "bytes", + "camino", "chrono", "const_format", "enum-map", @@ -3778,11 +3781,16 @@ dependencies = [ "humantime", "humantime-serde", "itertools 0.10.5", + "nix 0.27.1", + "postgres_backend", "postgres_ffi", "rand 0.8.5", + "remote_storage", + "reqwest 0.12.4", "serde", "serde_json", "serde_with", + "storage_broker", "strum", "strum_macros", "thiserror", @@ -3794,7 +3802,6 @@ name = "pageserver_client" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "bytes", "futures", "pageserver_api", @@ -5455,6 +5462,12 @@ version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bebd363326d05ec3e2f532ab7660680f3b02130d780c299bca73469d521bc0ed" +[[package]] +name = "send-future" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87" + [[package]] name = "sentry" version = "0.32.3" @@ -5950,7 +5963,6 @@ name = "storage_controller_client" version = "0.1.0" dependencies = [ "anyhow", - "async-trait", "bytes", "futures", "pageserver_api", @@ -6953,7 +6965,6 @@ dependencies = [ "anyhow", "arc-swap", "async-compression", - "async-trait", "bincode", "byteorder", "bytes", @@ -6969,7 +6980,6 @@ dependencies = [ "humantime", "hyper 0.14.26", "jsonwebtoken", - "leaky-bucket", "metrics", "nix 0.27.1", "once_cell", diff --git a/Cargo.toml b/Cargo.toml index e038c0b4ff..4fea3e8d80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,6 +65,7 @@ axum = { version = "0.6.20", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" bindgen = "0.65" +bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" bytes = "1.0" @@ -102,12 +103,12 @@ humantime-serde = "1.1.1" hyper = "0.14" tokio-tungstenite = "0.20.0" indexmap = "2" +indoc = "2" inotify = "0.10.2" ipnet = "2.9.0" itertools = "0.10" jsonwebtoken = "9" lasso = "0.7" -leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } @@ -145,6 +146,7 @@ rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" sd-notify = "0.4.1" +send-future = "0.1.0" sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 0ba2c1aeb4..9499a7186e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -44,6 +44,7 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; +use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info, warn}; @@ -366,6 +367,8 @@ fn wait_spec( state.start_time = now; } + launch_lsn_lease_bg_task_for_static(&compute); + Ok(WaitSpecResult { compute, http_port, diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 543d4462ed..c402d63305 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod logger; pub mod catalog; pub mod compute; pub mod extension_server; +pub mod lsn_lease; mod migration; pub mod monitor; pub mod params; diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs new file mode 100644 index 0000000000..7e5917c55f --- /dev/null +++ b/compute_tools/src/lsn_lease.rs @@ -0,0 +1,186 @@ +use anyhow::bail; +use anyhow::Result; +use postgres::{NoTls, SimpleQueryMessage}; +use std::time::SystemTime; +use std::{str::FromStr, sync::Arc, thread, time::Duration}; +use utils::id::TenantId; +use utils::id::TimelineId; + +use compute_api::spec::ComputeMode; +use tracing::{info, warn}; +use utils::{ + lsn::Lsn, + shard::{ShardCount, ShardNumber, TenantShardId}, +}; + +use crate::compute::ComputeNode; + +/// Spawns a background thread to periodically renew LSN leases for static compute. +/// Do nothing if the compute is not in static mode. +pub fn launch_lsn_lease_bg_task_for_static(compute: &Arc) { + let (tenant_id, timeline_id, lsn) = { + let state = compute.state.lock().unwrap(); + let spec = state.pspec.as_ref().expect("Spec must be set"); + match spec.spec.mode { + ComputeMode::Static(lsn) => (spec.tenant_id, spec.timeline_id, lsn), + _ => return, + } + }; + let compute = compute.clone(); + + let span = tracing::info_span!("lsn_lease_bg_task", %tenant_id, %timeline_id, %lsn); + thread::spawn(move || { + let _entered = span.entered(); + if let Err(e) = lsn_lease_bg_task(compute, tenant_id, timeline_id, lsn) { + // TODO: might need stronger error feedback than logging an warning. + warn!("Exited with error: {e}"); + } + }); +} + +/// Renews lsn lease periodically so static compute are not affected by GC. +fn lsn_lease_bg_task( + compute: Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, +) -> Result<()> { + loop { + let valid_until = acquire_lsn_lease_with_retry(&compute, tenant_id, timeline_id, lsn)?; + let valid_duration = valid_until + .duration_since(SystemTime::now()) + .unwrap_or(Duration::ZERO); + + // Sleep for 60 seconds less than the valid duration but no more than half of the valid duration. + let sleep_duration = valid_duration + .saturating_sub(Duration::from_secs(60)) + .max(valid_duration / 2); + + info!( + "Succeeded, sleeping for {} seconds", + sleep_duration.as_secs() + ); + thread::sleep(sleep_duration); + } +} + +/// Acquires lsn lease in a retry loop. Returns the expiration time if a lease is granted. +/// Returns an error if a lease is explicitly not granted. Otherwise, we keep sending requests. +fn acquire_lsn_lease_with_retry( + compute: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, +) -> Result { + let mut attempts = 0usize; + let mut retry_period_ms: f64 = 500.0; + const MAX_RETRY_PERIOD_MS: f64 = 60.0 * 1000.0; + + loop { + // Note: List of pageservers is dynamic, need to re-read configs before each attempt. + let configs = { + let state = compute.state.lock().unwrap(); + + let spec = state.pspec.as_ref().expect("spec must be set"); + + let conn_strings = spec.pageserver_connstr.split(','); + + conn_strings + .map(|connstr| { + let mut config = postgres::Config::from_str(connstr).expect("Invalid connstr"); + if let Some(storage_auth_token) = &spec.storage_auth_token { + info!("Got storage auth token from spec file"); + config.password(storage_auth_token.clone()); + } else { + info!("Storage auth token not set"); + } + config + }) + .collect::>() + }; + + let result = try_acquire_lsn_lease(tenant_id, timeline_id, lsn, &configs); + match result { + Ok(Some(res)) => { + return Ok(res); + } + Ok(None) => { + bail!("Permanent error: lease could not be obtained, LSN is behind the GC cutoff"); + } + Err(e) => { + warn!("Failed to acquire lsn lease: {e} (attempt {attempts}"); + + thread::sleep(Duration::from_millis(retry_period_ms as u64)); + retry_period_ms *= 1.5; + retry_period_ms = retry_period_ms.min(MAX_RETRY_PERIOD_MS); + } + } + attempts += 1; + } +} + +/// Tries to acquire an LSN lease through PS page_service API. +fn try_acquire_lsn_lease( + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Lsn, + configs: &[postgres::Config], +) -> Result> { + fn get_valid_until( + config: &postgres::Config, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, + ) -> Result> { + let mut client = config.connect(NoTls)?; + let cmd = format!("lease lsn {} {} {} ", tenant_shard_id, timeline_id, lsn); + let res = client.simple_query(&cmd)?; + let msg = match res.first() { + Some(msg) => msg, + None => bail!("empty response"), + }; + let row = match msg { + SimpleQueryMessage::Row(row) => row, + _ => bail!("error parsing lsn lease response"), + }; + + // Note: this will be None if a lease is explicitly not granted. + let valid_until_str = row.get("valid_until"); + + let valid_until = valid_until_str.map(|s| { + SystemTime::UNIX_EPOCH + .checked_add(Duration::from_millis(u128::from_str(s).unwrap() as u64)) + .expect("Time larger than max SystemTime could handle") + }); + Ok(valid_until) + } + + let shard_count = configs.len(); + + let valid_until = if shard_count > 1 { + configs + .iter() + .enumerate() + .map(|(shard_number, config)| { + let tenant_shard_id = TenantShardId { + tenant_id, + shard_count: ShardCount::new(shard_count as u8), + shard_number: ShardNumber(shard_number as u8), + }; + get_valid_until(config, tenant_shard_id, timeline_id, lsn) + }) + .collect::>>>()? + .into_iter() + .min() + .unwrap() + } else { + get_valid_until( + &configs[0], + TenantShardId::unsharded(tenant_id), + timeline_id, + lsn, + )? + }; + + Ok(valid_until) +} diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 487ac8f047..6fca59b368 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -6,7 +6,6 @@ license.workspace = true [dependencies] anyhow.workspace = true -async-trait.workspace = true camino.workspace = true clap.workspace = true comfy-table.workspace = true diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 74caba2b56..5dbc3bcbbc 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -165,6 +165,9 @@ pub struct NeonStorageControllerConf { pub split_threshold: Option, pub max_secondary_lag_bytes: Option, + + #[serde(with = "humantime_serde")] + pub heartbeat_interval: Duration, } impl NeonStorageControllerConf { @@ -172,6 +175,9 @@ impl NeonStorageControllerConf { const DEFAULT_MAX_OFFLINE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10); const DEFAULT_MAX_WARMING_UP_INTERVAL: std::time::Duration = std::time::Duration::from_secs(30); + + // Very tight heartbeat interval to speed up tests + const DEFAULT_HEARTBEAT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100); } impl Default for NeonStorageControllerConf { @@ -183,6 +189,7 @@ impl Default for NeonStorageControllerConf { database_url: None, split_threshold: None, max_secondary_lag_bytes: None, + heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, } } } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 399b1c2653..31777eb7a5 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -181,6 +181,23 @@ impl PageServerNode { ); io::stdout().flush()?; + // If the config file we got as a CLI argument includes the `availability_zone` + // config, then use that to populate the `metadata.json` file for the pageserver. + // In production the deployment orchestrator does this for us. + let az_id = conf + .other + .get("availability_zone") + .map(|toml| { + let az_str = toml.to_string(); + // Trim the (") chars from the toml representation + if az_str.starts_with('"') && az_str.ends_with('"') { + az_str[1..az_str.len() - 1].to_string() + } else { + az_str + } + }) + .unwrap_or("local".to_string()); + let config = self .pageserver_init_make_toml(conf) .context("make pageserver toml")?; @@ -216,6 +233,7 @@ impl PageServerNode { let (_http_host, http_port) = parse_host_port(&self.conf.listen_http_addr).expect("Unable to parse listen_http_addr"); let http_port = http_port.unwrap_or(9898); + // Intentionally hand-craft JSON: this acts as an implicit format compat test // in case the pageserver-side structure is edited, and reflects the real life // situation: the metadata is written by some other script. @@ -226,7 +244,10 @@ impl PageServerNode { postgres_port: self.pg_connection_config.port(), http_host: "localhost".to_string(), http_port, - other: HashMap::new(), + other: HashMap::from([( + "availability_zone_id".to_string(), + serde_json::json!(az_id), + )]), }) .unwrap(), ) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index a0a73f5609..573f1688d5 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -5,6 +5,7 @@ //! ```text //! .neon/safekeepers/ //! ``` +use std::future::Future; use std::io::Write; use std::path::PathBuf; use std::time::Duration; @@ -34,12 +35,10 @@ pub enum SafekeeperHttpError { type Result = result::Result; -#[async_trait::async_trait] -pub trait ResponseErrorMessageExt: Sized { - async fn error_from_body(self) -> Result; +pub(crate) trait ResponseErrorMessageExt: Sized { + fn error_from_body(self) -> impl Future> + Send; } -#[async_trait::async_trait] impl ResponseErrorMessageExt for reqwest::Response { async fn error_from_body(self) -> Result { let status = self.status(); diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 27d8e2de0c..c715d6b789 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -437,6 +437,8 @@ impl StorageController { &humantime::Duration::from(self.config.max_offline).to_string(), "--max-warming-up-interval", &humantime::Duration::from(self.config.max_warming_up).to_string(), + "--heartbeat-interval", + &humantime::Duration::from(self.config.heartbeat_interval).to_string(), "--address-for-peers", &address_for_peers.to_string(), ] diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 35510ccbca..5cce6cf3ae 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -41,6 +41,8 @@ enum Command { listen_http_addr: String, #[arg(long)] listen_http_port: u16, + #[arg(long)] + availability_zone_id: String, }, /// Modify a node's configuration in the storage controller @@ -322,6 +324,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + availability_zone_id, } => { storcon_client .dispatch::<_, ()>( @@ -333,6 +336,7 @@ async fn main() -> anyhow::Result<()> { listen_pg_port, listen_http_addr, listen_http_port, + availability_zone_id: Some(availability_zone_id), }), ) .await?; diff --git a/docs/rfcs/037-storage-controller-restarts.md b/docs/rfcs/037-storage-controller-restarts.md new file mode 100644 index 0000000000..bad422344f --- /dev/null +++ b/docs/rfcs/037-storage-controller-restarts.md @@ -0,0 +1,259 @@ +# Rolling Storage Controller Restarts + +## Summary + +This RFC describes the issues around the current storage controller restart procedure +and describes an implementation which reduces downtime to a few milliseconds on the happy path. + +## Motivation + +Storage controller upgrades (restarts, more generally) can cause multi-second availability gaps. +While the storage controller does not sit on the main data path, it's generally not acceptable +to block management requests for extended periods of time (e.g. https://github.com/neondatabase/neon/issues/8034). + +### Current Implementation + +The storage controller runs in a Kubernetes Deployment configured for one replica and strategy set to [Recreate](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#recreate-deployment). +In non Kubernetes terms, during an upgrade, the currently running storage controller is stopped and, only after, +a new instance is created. + +At start-up, the storage controller calls into all the pageservers it manages (retrieved from DB) to learn the +latest locations of all tenant shards present on them. This is usually fast, but can push into tens of seconds +under unfavourable circumstances: pageservers are heavily loaded or unavailable. + +## Prior Art + +There's probably as many ways of handling restarts gracefully as there are distributed systems. Some examples include: +* Active/Standby architectures: Two or more instance of the same service run, but traffic is only routed to one of them. +For fail-over, traffic is routed to one of the standbys (which becomes active). +* Consensus Algorithms (Raft, Paxos and friends): The part of consensus we care about here is leader election: peers communicate to each other +and use a voting scheme that ensures the existence of a single leader (e.g. Raft epochs). + +## Requirements + +* Reduce storage controller unavailability during upgrades to milliseconds +* Minimize the interval in which it's possible for more than one storage controller +to issue reconciles. +* Have one uniform implementation for restarts and upgrades +* Fit in with the current Kubernetes deployment scheme + +## Non Goals + +* Implement our own consensus algorithm from scratch +* Completely eliminate downtime storage controller downtime. Instead we aim to reduce it to the point where it looks +like a transient error to the control plane + +## Impacted Components + +* storage controller +* deployment orchestration (i.e. Ansible) +* helm charts + +## Terminology + +* Observed State: in-memory mapping between tenant shards and their current pageserver locations - currently built up +at start-up by quering pageservers +* Deployment: Kubernetes [primitive](https://kubernetes.io/docs/concepts/workloads/controllers/deployment/) that models +a set of replicas + +## Implementation + +### High Level Flow + +At a very high level the proposed idea is to start a new storage controller instance while +the previous one is still running and cut-over to it when it becomes ready. The new instance, +should coordinate with the existing one and transition responsibility gracefully. While the controller +has built in safety against split-brain situations (via generation numbers), we'd like to avoid such +scenarios since they can lead to availability issues for tenants that underwent changes while two controllers +were operating at the same time and require operator intervention to remedy. + +### Kubernetes Deployment Configuration + +On the Kubernetes configuration side, the proposal is to update the storage controller `Deployment` +to use `spec.strategy.type = RollingUpdate`, `spec.strategy.rollingUpdate.maxSurge=1` and `spec.strategy.maxUnavailable=0`. +Under the hood, Kubernetes creates a new replica set and adds one pod to it (`maxSurge=1`). The old replica set does not +scale down until the new replica set has one replica in the ready state (`maxUnavailable=0`). + +The various possible failure scenarios are investigated in the [Handling Failures](#handling-failures) section. + +### Storage Controller Start-Up + +This section describes the primitives required on the storage controller side and the flow of the happy path. + +#### Database Table For Leader Synchronization + +A new table should be added to the storage controller database for leader synchronization during startup. +This table will always contain at most one row. The proposed name for the table is `leader` and the schema +contains two elements: +* `hostname`: represents the hostname for the current storage controller leader - should be addressible +from other pods in the deployment +* `start_timestamp`: holds the start timestamp for the current storage controller leader (UTC timezone) - only required +for failure case handling: see [Previous Leader Crashes Before New Leader Readiness](#previous-leader-crashes-before-new-leader-readiness) + +Storage controllers will read the leader row at start-up and then update it to mark themselves as the leader +at the end of the start-up sequence. We want compare-and-exchange semantics for the update: avoid the +situation where two concurrent updates succeed and overwrite each other. The default Postgres isolation +level is `READ COMMITTED`, which isn't strict enough here. This update transaction should use at least `REPEATABLE +READ` isolation level in order to [prevent lost updates](https://www.interdb.jp/pg/pgsql05/08.html). Currently, +the storage controller uses the stricter `SERIALIZABLE` isolation level for all transactions. This more than suits +our needs here. + +``` +START TRANSACTION ISOLATION LEVEL REPEATABLE READ +UPDATE leader SET hostname=, start_timestamp= +WHERE hostname=, start_timestampt=; +``` + +If the transaction fails or if no rows have been updated, then the compare-and-exchange is regarded as a failure. + +#### Step Down API + +A new HTTP endpoint should be added to the storage controller: `POST /control/v1/step_down`. Upon receiving this +request the leader cancels any pending reconciles and goes into a mode where it replies with 503 to all other APIs +and does not issue any location configurations to its pageservers. The successful HTTP response will return a serialized +snapshot of the observed state. + +If other step down requests come in after the initial one, the request is handled and the observed state is returned (required +for failure scenario handling - see [Handling Failures](#handling-failures)). + +#### Graceful Restart Happy Path + +At start-up, the first thing the storage controller does is retrieve the sole row from the new +`leader` table. If such an entry exists, send a `/step_down` PUT API call to the current leader. +This should be retried a few times with a short backoff (see [1]). The aspiring leader loads the +observed state into memory and the start-up sequence proceeds as usual, but *without* querying the +pageservers in order to build up the observed state. + +Before doing any reconciliations or persistence change, update the `leader` database table as described in the [Database Table For Leader Synchronization](database-table-for-leader-synchronization) +section. If this step fails, the storage controller process exits. + +Note that no row will exist in the `leaders` table for the first graceful restart. In that case, force update the `leader` table +(without the WHERE clause) and perform with the pre-existing start-up procedure (i.e. build observed state by querying pageservers). + +Summary of proposed new start-up sequence: +1. Call `/step_down` +2. Perform any pending database migrations +3. Load state from database +4. Load observed state returned in step (1) into memory +5. Do initial heartbeat round (may be moved after 5) +7. Mark self as leader by updating the database +8. Reschedule and reconcile everything + +Some things to note from the steps above: +* The storage controller makes no changes to the cluster state before step (5) (i.e. no location config +calls to the pageserver and no compute notifications) +* Ask the current leader to step down before loading state from database so we don't get a lost update +if the transactions overlap. +* Before loading the observed state at step (3), cross-validate against the database. If validation fails, +fall back to asking the pageservers about their current locations. +* Database migrations should only run **after** the previous instance steps down (or the step down times out). + + +[1] The API call might fail because there's no storage controller running (i.e. [restart](#storage-controller-crash-or-restart)), +so we don't want to extend the unavailability period by much. We still want to retry since that's not the common case. + +### Handling Failures + +#### Storage Controller Crash Or Restart + +The storage controller may crash or be restarted outside of roll-outs. When a new pod is created, its call to +`/step_down` will fail since the previous leader is no longer reachable. In this case perform the pre-existing +start-up procedure and update the leader table (with the WHERE clause). If the update fails, the storage controller +exists and consistency is maintained. + +#### Previous Leader Crashes Before New Leader Readiness + +When the previous leader (P1) crashes before the new leader (P2) passses the readiness check, Kubernetes will +reconcile the old replica set and create a new pod for it (P1'). The `/step_down` API call will fail for P1' +(see [2]). + +Now we have two cases to consider: +* P2 updates the `leader` table first: The database update from P1' will fail and P1' will exit, or be terminated +by Kubernetes depending on timings. +* P1' updates the `leader` table first: The `hostname` field of the `leader` row stays the same, but the `start_timestamp` field changes. +The database update from P2 will fail (since `start_timestamp` does not match). P2 will exit and Kubernetes will +create a new replacement pod for it (P2'). Now the entire dance starts again, but with P1' as the leader and P2' as the incumbent. + +[2] P1 and P1' may (more likely than not) be the same pod and have the same hostname. The implementation +should avoid this self reference and fail the API call at the client if the persisted hostname matches +the current one. + +#### Previous Leader Crashes After New Leader Readiness + +The deployment's replica sets already satisfy the deployment's replica count requirements and the +Kubernetes deployment rollout will just clean up the dead pod. + +#### New Leader Crashes Before Pasing Readiness Check + +The deployment controller scales up the new replica sets by creating a new pod. The entire procedure is repeated +with the new pod. + +#### Network Partition Between New Pod and Previous Leader + +This feels very unlikely, but should be considered in any case. P2 (the new aspiring leader) fails the `/step_down` +API call into P1 (the current leader). P2 proceeds with the pre-existing startup procedure and updates the `leader` table. +Kubernetes will terminate P1, but there may be a brief period where both storage controller can drive reconciles. + +### Dealing With Split Brain Scenarios + +As we've seen in the previous section, we can end up with two storage controller running at the same time. The split brain +duration is not bounded since the Kubernetes controller might become partitioned from the pods (unlikely though). While these +scenarios are not fatal, they can cause tenant unavailability, so we'd like to reduce the chances of this happening. +The rest of this section sketches some safety measure. It's likely overkill to implement all of them however. + +### Ensure Leadership Before Producing Side Effects + +The storage controller has two types of side effects: location config requests into pageservers and compute notifications into the control plane. +Before issuing either, the storage controller could check that it is indeed still the leader by querying the database. Side effects might still be +applied if they race with the database updatem, but the situation will eventually be detected. The storage controller process should terminate in these cases. + +### Leadership Lease + +Up until now, the leadership defined by this RFC is static. In order to bound the length of the split brain scenario, we could require the leadership +to be renewed periodically. Two new columns would be added to the leaders table: +1. `last_renewed` - timestamp indicating when the lease was last renewed +2. `lease_duration` - duration indicating the amount of time after which the lease expires + +The leader periodically attempts to renew the lease by checking that it is in fact still the legitimate leader and updating `last_renewed` in the +same transaction. If the update fails, the process exits. New storage controller instances wishing to become leaders must wait for the current lease +to expire before acquiring leadership if they have not succesfully received a response to the `/step_down` request. + +### Notify Pageserver Of Storage Controller Term + +Each time that leadership changes, we can bump a `term` integer column in the `leader` table. This term uniquely identifies a leader. +Location config requests and re-attach responses can include this term. On the pageserver side, keep the latest term in memory and refuse +anything which contains a stale term (i.e. smaller than the current one). + +### Observability + +* The storage controller should expose a metric which describes it's state (`Active | WarmingUp | SteppedDown`). +Per region alerts should be added on this metric which triggers when: + + no storage controller has been in the `Active` state for an extended period of time + + more than one storage controllers are in the `Active` state + +* An alert that periodically verifies that the `leader` table is in sync with the metric above would be very useful. +We'd have to expose the storage controller read only database to Grafana (perhaps it is already done). + +## Alternatives + +### Kubernetes Leases + +Kubernetes has a [lease primitive](https://kubernetes.io/docs/concepts/architecture/leases/) which can be used to implement leader election. +Only one instance may hold a lease at any given time. This lease needs to be periodically renewed and has an expiration period. + +In our case, it would work something like this: +* `/step_down` deletes the lease or stops it from renewing +* lease acquisition becomes part of the start-up procedure + +The kubert crate implements a [lightweight lease API](https://docs.rs/kubert/latest/kubert/lease/struct.LeaseManager.html), but it's still +not exactly trivial to implement. + +This approach has the benefit of baked in observability (`kubectl describe lease`), but: +* We offload the responsibility to Kubernetes which makes it harder to debug when things go wrong. +* More code surface than the simple "row in database" approach. Also, most of this code would be in +a dependency not subject to code review, etc. +* Hard to test. Our testing infra does not run the storage controller in Kubernetes and changing it do +so is not simple and complictes and the test set-up. + +To my mind, the "row in database" approach is straightforward enough that we don't have to offload this +to something external. diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index cb28359ac3..8710904cec 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -4,6 +4,10 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +# See pageserver/Cargo.toml +testing = ["dep:nix"] + [dependencies] serde.workspace = true serde_with.workspace = true @@ -23,6 +27,12 @@ thiserror.workspace = true humantime-serde.workspace = true chrono = { workspace = true, features = ["serde"] } itertools.workspace = true +storage_broker.workspace = true +camino = {workspace = true, features = ["serde1"]} +remote_storage.workspace = true +postgres_backend.workspace = true +nix = {workspace = true, optional = true} +reqwest.workspace = true [dev-dependencies] bincode.workspace = true diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index d996a62349..b2662c562a 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -1,15 +1,28 @@ -use std::collections::HashMap; - -use const_format::formatcp; +use camino::Utf8PathBuf; #[cfg(test)] mod tests; +use const_format::formatcp; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898; pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}"); +use postgres_backend::AuthType; +use remote_storage::RemoteStorageConfig; +use serde_with::serde_as; +use std::{ + collections::HashMap, + num::{NonZeroU64, NonZeroUsize}, + str::FromStr, + time::Duration, +}; +use utils::logging::LogFormat; + +use crate::models::ImageCompressionAlgorithm; +use crate::models::LsnLease; + // Certain metadata (e.g. externally-addressable name, AZ) is delivered // as a separate structure. This information is not neeed by the pageserver // itself, it is only used for registering the pageserver with the control @@ -29,3 +42,511 @@ pub struct NodeMetadata { #[serde(flatten)] pub other: HashMap, } + +/// `pageserver.toml` +/// +/// We use serde derive with `#[serde(default)]` to generate a deserializer +/// that fills in the default values for each config field. +/// +/// If there cannot be a static default value because we need to make runtime +/// checks to determine the default, make it an `Option` (which defaults to None). +/// The runtime check should be done in the consuming crate, i.e., `pageserver`. +#[serde_as] +#[derive(Clone, Debug, serde::Deserialize, serde::Serialize)] +#[serde(default, deny_unknown_fields)] +pub struct ConfigToml { + // types mapped 1:1 into the runtime PageServerConfig type + pub listen_pg_addr: String, + pub listen_http_addr: String, + pub availability_zone: Option, + #[serde(with = "humantime_serde")] + pub wait_lsn_timeout: Duration, + #[serde(with = "humantime_serde")] + pub wal_redo_timeout: Duration, + pub superuser: String, + pub page_cache_size: usize, + pub max_file_descriptors: usize, + pub pg_distrib_dir: Option, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub http_auth_type: AuthType, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub pg_auth_type: AuthType, + pub auth_validation_public_key_path: Option, + pub remote_storage: Option, + pub tenant_config: TenantConfigToml, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub broker_endpoint: storage_broker::Uri, + #[serde(with = "humantime_serde")] + pub broker_keepalive_interval: Duration, + #[serde_as(as = "serde_with::DisplayFromStr")] + pub log_format: LogFormat, + pub concurrent_tenant_warmup: NonZeroUsize, + pub concurrent_tenant_size_logical_size_queries: NonZeroUsize, + #[serde(with = "humantime_serde")] + pub metric_collection_interval: Duration, + pub metric_collection_endpoint: Option, + pub metric_collection_bucket: Option, + #[serde(with = "humantime_serde")] + pub synthetic_size_calculation_interval: Duration, + pub disk_usage_based_eviction: Option, + pub test_remote_failures: u64, + pub ondemand_download_behavior_treat_error_as_warn: bool, + #[serde(with = "humantime_serde")] + pub background_task_maximum_delay: Duration, + pub control_plane_api: Option, + pub control_plane_api_token: Option, + pub control_plane_emergency_mode: bool, + pub heatmap_upload_concurrency: usize, + pub secondary_download_concurrency: usize, + pub virtual_file_io_engine: Option, + pub ingest_batch_size: u64, + pub max_vectored_read_bytes: MaxVectoredReadBytes, + pub image_compression: ImageCompressionAlgorithm, + pub ephemeral_bytes_per_memory_kb: usize, + pub l0_flush: Option, + pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode, + pub io_buffer_alignment: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(deny_unknown_fields)] +pub struct DiskUsageEvictionTaskConfig { + pub max_usage_pct: utils::serde_percent::Percent, + pub min_avail_bytes: u64, + #[serde(with = "humantime_serde")] + pub period: Duration, + #[cfg(feature = "testing")] + pub mock_statvfs: Option, + /// Select sorting for evicted layers + #[serde(default)] + pub eviction_order: EvictionOrder, +} + +pub mod statvfs { + pub mod mock { + #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[serde(tag = "type")] + pub enum Behavior { + Success { + blocksize: u64, + total_blocks: u64, + name_filter: Option, + }, + #[cfg(feature = "testing")] + Failure { mocked_error: MockedError }, + } + + #[cfg(feature = "testing")] + #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] + #[allow(clippy::upper_case_acronyms)] + pub enum MockedError { + EIO, + } + + #[cfg(feature = "testing")] + impl From for nix::Error { + fn from(e: MockedError) -> Self { + match e { + MockedError::EIO => nix::Error::EIO, + } + } + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(tag = "type", content = "args")] +pub enum EvictionOrder { + RelativeAccessed { + highest_layer_count_loses_first: bool, + }, +} + +impl Default for EvictionOrder { + fn default() -> Self { + Self::RelativeAccessed { + highest_layer_count_loses_first: true, + } + } +} + +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetVectoredImpl { + Sequential, + Vectored, +} + +#[derive( + Eq, + PartialEq, + Debug, + Copy, + Clone, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, +)] +#[strum(serialize_all = "kebab-case")] +pub enum GetImpl { + Legacy, + Vectored, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(transparent)] +pub struct MaxVectoredReadBytes(pub NonZeroUsize); + +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum CompactL0Phase1ValueAccess { + /// The old way. + PageCachedBlobIo, + /// The new way. + StreamingKmerge { + /// If set, we run both the old way and the new way, validate that + /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), + /// and if the validation fails, + /// - in tests: fail them with a panic or + /// - in prod, log a rate-limited warning and use the old way's results. + /// + /// If not set, we only run the new way and trust its results. + validate: Option, + }, +} + +/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "kebab-case")] +pub enum CompactL0BypassPageCacheValidation { + /// Validate that the series of (key, lsn) pairs are the same. + KeyLsn, + /// Validate that the entire output of old and new way is identical. + KeyLsnValue, +} + +impl Default for CompactL0Phase1ValueAccess { + fn default() -> Self { + CompactL0Phase1ValueAccess::StreamingKmerge { + // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident + validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), + } + } +} + +/// A tenant's calcuated configuration, which is the result of merging a +/// tenant's TenantConfOpt with the global TenantConf from PageServerConf. +/// +/// For storing and transmitting individual tenant's configuration, see +/// TenantConfOpt. +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(deny_unknown_fields, default)] +pub struct TenantConfigToml { + // Flush out an inmemory layer, if it's holding WAL older than this + // This puts a backstop on how much WAL needs to be re-digested if the + // page server crashes. + // This parameter actually determines L0 layer file size. + pub checkpoint_distance: u64, + // Inmemory layer is also flushed at least once in checkpoint_timeout to + // eventually upload WAL after activity is stopped. + #[serde(with = "humantime_serde")] + pub checkpoint_timeout: Duration, + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub compaction_target_size: u64, + // How often to check if there's compaction work to be done. + // Duration::ZERO means automatic compaction is disabled. + #[serde(with = "humantime_serde")] + pub compaction_period: Duration, + // Level0 delta layer threshold for compaction. + pub compaction_threshold: usize, + pub compaction_algorithm: crate::models::CompactionAlgorithmSettings, + // Determines how much history is retained, to allow + // branching and read replicas at an older point in time. + // The unit is #of bytes of WAL. + // Page versions older than this are garbage collected away. + pub gc_horizon: u64, + // Interval at which garbage collection is triggered. + // Duration::ZERO means automatic GC is disabled + #[serde(with = "humantime_serde")] + pub gc_period: Duration, + // Delta layer churn threshold to create L1 image layers. + pub image_creation_threshold: usize, + // Determines how much history is retained, to allow + // branching and read replicas at an older point in time. + // The unit is time. + // Page versions older than this are garbage collected away. + #[serde(with = "humantime_serde")] + pub pitr_interval: Duration, + /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. + #[serde(with = "humantime_serde")] + pub walreceiver_connect_timeout: Duration, + /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. + /// A stalled safekeeper will be changed to a newer one when it appears. + #[serde(with = "humantime_serde")] + pub lagging_wal_timeout: Duration, + /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. + /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, + /// to avoid eager reconnects. + pub max_lsn_wal_lag: NonZeroU64, + pub eviction_policy: crate::models::EvictionPolicy, + pub min_resident_size_override: Option, + // See the corresponding metric's help string. + #[serde(with = "humantime_serde")] + pub evictions_low_residence_duration_metric_threshold: Duration, + + /// If non-zero, the period between uploads of a heatmap from attached tenants. This + /// may be disabled if a Tenant will not have secondary locations: only secondary + /// locations will use the heatmap uploaded by attached locations. + #[serde(with = "humantime_serde")] + pub heatmap_period: Duration, + + /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup + pub lazy_slru_download: bool, + + pub timeline_get_throttle: crate::models::ThrottleConfig, + + // How much WAL must be ingested before checking again whether a new image layer is required. + // Expresed in multiples of checkpoint distance. + pub image_layer_creation_check_threshold: u8, + + /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into + /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. + /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux + /// file is written. + pub switch_aux_file_policy: crate::models::AuxFilePolicy, + + /// The length for an explicit LSN lease request. + /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. + #[serde(with = "humantime_serde")] + pub lsn_lease_length: Duration, + + /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request. + /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. + #[serde(with = "humantime_serde")] + pub lsn_lease_length_for_ts: Duration, +} + +pub mod defaults { + use crate::models::ImageCompressionAlgorithm; + + pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; + + pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; + pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; + + pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; + + pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; + pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; + + pub const DEFAULT_LOG_FORMAT: &str = "plain"; + + pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; + + pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = 1; + + pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; + pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; + pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; + pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; + + pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; + pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; + + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + + pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = + ImageCompressionAlgorithm::Zstd { level: Some(1) }; + + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; + + pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; + + pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; +} + +impl Default for ConfigToml { + fn default() -> Self { + use defaults::*; + + Self { + listen_pg_addr: (DEFAULT_PG_LISTEN_ADDR.to_string()), + listen_http_addr: (DEFAULT_HTTP_LISTEN_ADDR.to_string()), + availability_zone: (None), + wait_lsn_timeout: (humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) + .expect("cannot parse default wait lsn timeout")), + wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) + .expect("cannot parse default wal redo timeout")), + superuser: (DEFAULT_SUPERUSER.to_string()), + page_cache_size: (DEFAULT_PAGE_CACHE_SIZE), + max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS), + pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir() + http_auth_type: (AuthType::Trust), + pg_auth_type: (AuthType::Trust), + auth_validation_public_key_path: (None), + remote_storage: None, + broker_endpoint: (storage_broker::DEFAULT_ENDPOINT + .parse() + .expect("failed to parse default broker endpoint")), + broker_keepalive_interval: (humantime::parse_duration( + storage_broker::DEFAULT_KEEPALIVE_INTERVAL, + ) + .expect("cannot parse default keepalive interval")), + log_format: (LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), + + concurrent_tenant_warmup: (NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) + .expect("Invalid default constant")), + concurrent_tenant_size_logical_size_queries: NonZeroUsize::new(1).unwrap(), + metric_collection_interval: (humantime::parse_duration( + DEFAULT_METRIC_COLLECTION_INTERVAL, + ) + .expect("cannot parse default metric collection interval")), + synthetic_size_calculation_interval: (humantime::parse_duration( + DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, + ) + .expect("cannot parse default synthetic size calculation interval")), + metric_collection_endpoint: (DEFAULT_METRIC_COLLECTION_ENDPOINT), + + metric_collection_bucket: (None), + + disk_usage_based_eviction: (None), + + test_remote_failures: (0), + + ondemand_download_behavior_treat_error_as_warn: (false), + + background_task_maximum_delay: (humantime::parse_duration( + DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, + ) + .unwrap()), + + control_plane_api: (None), + control_plane_api_token: (None), + control_plane_emergency_mode: (false), + + heatmap_upload_concurrency: (DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), + secondary_download_concurrency: (DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), + + ingest_batch_size: (DEFAULT_INGEST_BATCH_SIZE), + + virtual_file_io_engine: None, + + max_vectored_read_bytes: (MaxVectoredReadBytes( + NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), + )), + image_compression: (DEFAULT_IMAGE_COMPRESSION), + ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + l0_flush: None, + compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), + virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(), + + io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, + + tenant_config: TenantConfigToml::default(), + } + } +} + +pub mod tenant_conf_defaults { + + // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB + // would be more appropriate. But a low value forces the code to be exercised more, + // which is good for now to trigger bugs. + // This parameter actually determines L0 layer file size. + pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; + pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; + + // FIXME the below configs are only used by legacy algorithm. The new algorithm + // has different parameters. + + // Target file size, when creating image and delta layers. + // This parameter determines L1 layer file size. + pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; + + pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; + pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; + pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm = + crate::models::CompactionAlgorithm::Legacy; + + pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; + + // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. + // If there's a need to decrease this value, first make sure that GC + // doesn't hold a layer map write lock for non-trivial operations. + // Relevant: https://github.com/neondatabase/neon/issues/3394 + pub const DEFAULT_GC_PERIOD: &str = "1 hr"; + pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; + pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; + pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; + pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; + // The default limit on WAL lag should be set to avoid causing disconnects under high throughput + // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for + // throughputs up to 1GiB/s per timeline. + pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; + pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + // By default ingest enough WAL for two new L0 layers before checking if new image + // image layers should be created. + pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; +} + +impl Default for TenantConfigToml { + fn default() -> Self { + use tenant_conf_defaults::*; + Self { + checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, + checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) + .expect("cannot parse default checkpoint timeout"), + compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, + compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) + .expect("cannot parse default compaction period"), + compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, + compaction_algorithm: crate::models::CompactionAlgorithmSettings { + kind: DEFAULT_COMPACTION_ALGORITHM, + }, + gc_horizon: DEFAULT_GC_HORIZON, + gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) + .expect("cannot parse default gc period"), + image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, + pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) + .expect("cannot parse default PITR interval"), + walreceiver_connect_timeout: humantime::parse_duration( + DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, + ) + .expect("cannot parse default walreceiver connect timeout"), + lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) + .expect("cannot parse default walreceiver lagging wal timeout"), + max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) + .expect("cannot parse default max walreceiver Lsn wal lag"), + eviction_policy: crate::models::EvictionPolicy::NoEviction, + min_resident_size_override: None, + evictions_low_residence_duration_metric_threshold: humantime::parse_duration( + DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, + ) + .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), + heatmap_period: Duration::ZERO, + lazy_slru_download: false, + timeline_get_throttle: crate::models::ThrottleConfig::disabled(), + image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, + switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(), + lsn_lease_length: LsnLease::DEFAULT_LENGTH, + lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, + } + } +} diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index a9a57d77ce..345abd69b6 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -56,6 +56,8 @@ pub struct NodeRegisterRequest { pub listen_http_addr: String, pub listen_http_port: u16, + + pub availability_zone_id: Option, } #[derive(Serialize, Deserialize)] diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 77da58d63e..77d744e4da 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -108,14 +108,41 @@ impl Key { } } + /// This function checks more extensively what keys we can take on the write path. + /// If a key beginning with 00 does not have a global/default tablespace OID, it + /// will be rejected on the write path. + #[allow(dead_code)] + pub fn is_valid_key_on_write_path_strong(&self) -> bool { + use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; + if !self.is_i128_representable() { + return false; + } + if self.field1 == 0 + && !(self.field2 == GLOBALTABLESPACE_OID + || self.field2 == DEFAULTTABLESPACE_OID + || self.field2 == 0) + { + return false; // User defined tablespaces are not supported + } + true + } + + /// This is a weaker version of `is_valid_key_on_write_path_strong` that simply + /// checks if the key is i128 representable. Note that some keys can be successfully + /// ingested into the pageserver, but will cause errors on generating basebackup. + pub fn is_valid_key_on_write_path(&self) -> bool { + self.is_i128_representable() + } + + pub fn is_i128_representable(&self) -> bool { + self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222 + } + /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish. /// As long as Neon does not support tablespace (because of lack of access to local file system), /// we can assume that only some predefined namespace OIDs are used which can fit in u16 pub fn to_i128(&self) -> i128 { - assert!( - self.field2 <= 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222, - "invalid key: {self}", - ); + assert!(self.is_i128_representable(), "invalid key: {self}"); (((self.field1 & 0x7F) as i128) << 120) | (((self.field2 & 0xFFFF) as i128) << 104) | ((self.field3 as i128) << 72) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d39ac75707..d13d04eb1b 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -6,8 +6,9 @@ pub use utilization::PageserverUtilization; use std::{ collections::HashMap, + fmt::Display, io::{BufRead, Read}, - num::{NonZeroU64, NonZeroUsize}, + num::{NonZeroU32, NonZeroU64, NonZeroUsize}, str::FromStr, sync::atomic::AtomicUsize, time::{Duration, SystemTime}, @@ -435,7 +436,9 @@ pub enum CompactionAlgorithm { Tiered, } -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive( + Debug, Clone, Copy, PartialEq, Eq, serde_with::DeserializeFromStr, serde_with::SerializeDisplay, +)] pub enum ImageCompressionAlgorithm { // Disabled for writes, support decompressing during read path Disabled, @@ -470,11 +473,33 @@ impl FromStr for ImageCompressionAlgorithm { } } +impl Display for ImageCompressionAlgorithm { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + ImageCompressionAlgorithm::Disabled => write!(f, "disabled"), + ImageCompressionAlgorithm::Zstd { level } => { + if let Some(level) = level { + write!(f, "zstd({})", level) + } else { + write!(f, "zstd") + } + } + } + } +} + #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] pub struct CompactionAlgorithmSettings { pub kind: CompactionAlgorithm, } +#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum L0FlushConfig { + #[serde(rename_all = "snake_case")] + Direct { max_concurrency: NonZeroUsize }, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct EvictionPolicyLayerAccessThreshold { #[serde(with = "humantime_serde")] @@ -486,12 +511,11 @@ pub struct EvictionPolicyLayerAccessThreshold { #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)] pub struct ThrottleConfig { pub task_kinds: Vec, // TaskKind - pub initial: usize, + pub initial: u32, #[serde(with = "humantime_serde")] pub refill_interval: Duration, - pub refill_amount: NonZeroUsize, - pub max: usize, - pub fair: bool, + pub refill_amount: NonZeroU32, + pub max: u32, } impl ThrottleConfig { @@ -501,9 +525,8 @@ impl ThrottleConfig { // other values don't matter with emtpy `task_kinds`. initial: 0, refill_interval: Duration::from_millis(1), - refill_amount: NonZeroUsize::new(1).unwrap(), + refill_amount: NonZeroU32::new(1).unwrap(), max: 1, - fair: true, } } /// The requests per second allowed by the given config. @@ -718,12 +741,17 @@ pub struct TimelineInfo { pub pg_version: u32, pub state: TimelineState, - pub is_archived: bool, pub walreceiver_status: String, + // ALWAYS add new fields at the end of the struct with `Option` to ensure forward/backward compatibility. + // Backward compatibility: you will get a JSON not containing the newly-added field. + // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does + // not deny unknown fields by default so it's safe to set the field to some value, though it won't be + // read. /// The last aux file policy being used on this timeline pub last_aux_file_policy: Option, + pub is_archived: Option, } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -1653,21 +1681,33 @@ mod tests { #[test] fn test_image_compression_algorithm_parsing() { use ImageCompressionAlgorithm::*; - assert_eq!( - ImageCompressionAlgorithm::from_str("disabled").unwrap(), - Disabled - ); - assert_eq!( - ImageCompressionAlgorithm::from_str("zstd").unwrap(), - Zstd { level: None } - ); - assert_eq!( - ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(), - Zstd { level: Some(18) } - ); - assert_eq!( - ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(), - Zstd { level: Some(-3) } - ); + let cases = [ + ("disabled", Disabled), + ("zstd", Zstd { level: None }), + ("zstd(18)", Zstd { level: Some(18) }), + ("zstd(-3)", Zstd { level: Some(-3) }), + ]; + + for (display, expected) in cases { + assert_eq!( + ImageCompressionAlgorithm::from_str(display).unwrap(), + expected, + "parsing works" + ); + assert_eq!(format!("{expected}"), display, "Display FromStr roundtrip"); + + let ser = serde_json::to_string(&expected).expect("serialization"); + assert_eq!( + serde_json::from_str::(&ser).unwrap(), + expected, + "serde roundtrip" + ); + + assert_eq!( + serde_json::Value::String(display.to_string()), + serde_json::to_value(expected).unwrap(), + "Display is the serde serialization" + ); + } } } diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0940ad207f..9acb105e9b 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -136,9 +136,9 @@ pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16; // Export some version independent functions that are used outside of this mod pub use v14::xlog_utils::encode_logical_message; -pub use v14::xlog_utils::from_pg_timestamp; pub use v14::xlog_utils::get_current_timestamp; pub use v14::xlog_utils::to_pg_timestamp; +pub use v14::xlog_utils::try_from_pg_timestamp; pub use v14::xlog_utils::XLogFileName; pub use v14::bindings::DBState_DB_SHUTDOWNED; diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 9fe7e8198b..0cfd56962e 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -135,6 +135,8 @@ pub fn get_current_timestamp() -> TimestampTz { mod timestamp_conversions { use std::time::Duration; + use anyhow::Context; + use super::*; const UNIX_EPOCH_JDATE: u64 = 2440588; // == date2j(1970, 1, 1) @@ -154,18 +156,18 @@ mod timestamp_conversions { } } - pub fn from_pg_timestamp(time: TimestampTz) -> SystemTime { + pub fn try_from_pg_timestamp(time: TimestampTz) -> anyhow::Result { let time: u64 = time .try_into() - .expect("timestamp before millenium (postgres epoch)"); + .context("timestamp before millenium (postgres epoch)")?; let since_unix_epoch = time + SECS_DIFF_UNIX_TO_POSTGRES_EPOCH * USECS_PER_SEC; SystemTime::UNIX_EPOCH .checked_add(Duration::from_micros(since_unix_epoch)) - .expect("SystemTime overflow") + .context("SystemTime overflow") } } -pub use timestamp_conversions::{from_pg_timestamp, to_pg_timestamp}; +pub use timestamp_conversions::{to_pg_timestamp, try_from_pg_timestamp}; // Returns (aligned) end_lsn of the last record in data_dir with WAL segments. // start_lsn must point to some previously known record boundary (beginning of @@ -545,14 +547,14 @@ mod tests { #[test] fn test_ts_conversion() { let now = SystemTime::now(); - let round_trip = from_pg_timestamp(to_pg_timestamp(now)); + let round_trip = try_from_pg_timestamp(to_pg_timestamp(now)).unwrap(); let now_since = now.duration_since(SystemTime::UNIX_EPOCH).unwrap(); let round_trip_since = round_trip.duration_since(SystemTime::UNIX_EPOCH).unwrap(); assert_eq!(now_since.as_micros(), round_trip_since.as_micros()); let now_pg = get_current_timestamp(); - let round_trip_pg = to_pg_timestamp(from_pg_timestamp(now_pg)); + let round_trip_pg = to_pg_timestamp(try_from_pg_timestamp(now_pg).unwrap()); assert_eq!(now_pg, round_trip_pg); } diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index fa3f2cba58..f819a1572a 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -235,6 +235,31 @@ timeout = '5s'"; ); } + #[test] + fn test_storage_class_serde_roundtrip() { + let classes = [ + None, + Some(StorageClass::Standard), + Some(StorageClass::IntelligentTiering), + ]; + for class in classes { + #[derive(Serialize, Deserialize)] + struct Wrapper { + #[serde( + deserialize_with = "deserialize_storage_class", + serialize_with = "serialize_storage_class" + )] + class: Option, + } + let wrapped = Wrapper { + class: class.clone(), + }; + let serialized = serde_json::to_string(&wrapped).unwrap(); + let deserialized: Wrapper = serde_json::from_str(&serialized).unwrap(); + assert_eq!(class, deserialized.class); + } + } + #[test] fn test_azure_parsing() { let toml = "\ diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 6e593eeac1..19deaab63f 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -14,7 +14,6 @@ testing = ["fail/failpoints"] arc-swap.workspace = true sentry.workspace = true async-compression.workspace = true -async-trait.workspace = true anyhow.workspace = true bincode.workspace = true bytes.workspace = true @@ -26,7 +25,6 @@ hyper = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} jsonwebtoken.workspace = true -leaky-bucket.workspace = true nix.workspace = true once_cell.workspace = true pin-project-lite.workspace = true diff --git a/libs/utils/src/leaky_bucket.rs b/libs/utils/src/leaky_bucket.rs new file mode 100644 index 0000000000..a120dc0ac5 --- /dev/null +++ b/libs/utils/src/leaky_bucket.rs @@ -0,0 +1,280 @@ +//! This module implements the Generic Cell Rate Algorithm for a simplified +//! version of the Leaky Bucket rate limiting system. +//! +//! # Leaky Bucket +//! +//! If the bucket is full, no new requests are allowed and are throttled/errored. +//! If the bucket is partially full/empty, new requests are added to the bucket in +//! terms of "tokens". +//! +//! Over time, tokens are removed from the bucket, naturally allowing new requests at a steady rate. +//! +//! The bucket size tunes the burst support. The drain rate tunes the steady-rate requests per second. +//! +//! # [GCRA](https://en.wikipedia.org/wiki/Generic_cell_rate_algorithm) +//! +//! GCRA is a continuous rate leaky-bucket impl that stores minimal state and requires +//! no background jobs to drain tokens, as the design utilises timestamps to drain automatically over time. +//! +//! We store an "empty_at" timestamp as the only state. As time progresses, we will naturally approach +//! the empty state. The full-bucket state is calculated from `empty_at - config.bucket_width`. +//! +//! Another explaination can be found here: + +use std::{sync::Mutex, time::Duration}; + +use tokio::{sync::Notify, time::Instant}; + +pub struct LeakyBucketConfig { + /// This is the "time cost" of a single request unit. + /// Should loosely represent how long it takes to handle a request unit in active resource time. + /// Loosely speaking this is the inverse of the steady-rate requests-per-second + pub cost: Duration, + + /// total size of the bucket + pub bucket_width: Duration, +} + +impl LeakyBucketConfig { + pub fn new(rps: f64, bucket_size: f64) -> Self { + let cost = Duration::from_secs_f64(rps.recip()); + let bucket_width = cost.mul_f64(bucket_size); + Self { cost, bucket_width } + } +} + +pub struct LeakyBucketState { + /// Bucket is represented by `allow_at..empty_at` where `allow_at = empty_at - config.bucket_width`. + /// + /// At any given time, `empty_at - now` represents the number of tokens in the bucket, multiplied by the "time_cost". + /// Adding `n` tokens to the bucket is done by moving `empty_at` forward by `n * config.time_cost`. + /// If `now < allow_at`, the bucket is considered filled and cannot accept any more tokens. + /// Draining the bucket will happen naturally as `now` moves forward. + /// + /// Let `n` be some "time cost" for the request, + /// If now is after empty_at, the bucket is empty and the empty_at is reset to now, + /// If now is within the `bucket window + n`, we are within time budget. + /// If now is before the `bucket window + n`, we have run out of budget. + /// + /// This is inspired by the generic cell rate algorithm (GCRA) and works + /// exactly the same as a leaky-bucket. + pub empty_at: Instant, +} + +impl LeakyBucketState { + pub fn with_initial_tokens(config: &LeakyBucketConfig, initial_tokens: f64) -> Self { + LeakyBucketState { + empty_at: Instant::now() + config.cost.mul_f64(initial_tokens), + } + } + + pub fn bucket_is_empty(&self, now: Instant) -> bool { + // if self.end is after now, the bucket is not empty + self.empty_at <= now + } + + /// Immediately adds tokens to the bucket, if there is space. + /// + /// In a scenario where you are waiting for available rate, + /// rather than just erroring immediately, `started` corresponds to when this waiting started. + /// + /// `n` is the number of tokens that will be filled in the bucket. + /// + /// # Errors + /// + /// If there is not enough space, no tokens are added. Instead, an error is returned with the time when + /// there will be space again. + pub fn add_tokens( + &mut self, + config: &LeakyBucketConfig, + started: Instant, + n: f64, + ) -> Result<(), Instant> { + let now = Instant::now(); + + // invariant: started <= now + debug_assert!(started <= now); + + // If the bucket was empty when we started our search, + // we should update the `empty_at` value accordingly. + // this prevents us from having negative tokens in the bucket. + let mut empty_at = self.empty_at; + if empty_at < started { + empty_at = started; + } + + let n = config.cost.mul_f64(n); + let new_empty_at = empty_at + n; + let allow_at = new_empty_at.checked_sub(config.bucket_width); + + // empty_at + // allow_at | new_empty_at + // / | / + // -------o-[---------o-|--]--------- + // now1 ^ now2 ^ + // + // at now1, the bucket would be completely filled if we add n tokens. + // at now2, the bucket would be partially filled if we add n tokens. + + match allow_at { + Some(allow_at) if now < allow_at => Err(allow_at), + _ => { + self.empty_at = new_empty_at; + Ok(()) + } + } + } +} + +pub struct RateLimiter { + pub config: LeakyBucketConfig, + pub state: Mutex, + /// a queue to provide this fair ordering. + pub queue: Notify, +} + +struct Requeue<'a>(&'a Notify); + +impl Drop for Requeue<'_> { + fn drop(&mut self) { + self.0.notify_one(); + } +} + +impl RateLimiter { + pub fn with_initial_tokens(config: LeakyBucketConfig, initial_tokens: f64) -> Self { + RateLimiter { + state: Mutex::new(LeakyBucketState::with_initial_tokens( + &config, + initial_tokens, + )), + config, + queue: { + let queue = Notify::new(); + queue.notify_one(); + queue + }, + } + } + + pub fn steady_rps(&self) -> f64 { + self.config.cost.as_secs_f64().recip() + } + + /// returns true if we did throttle + pub async fn acquire(&self, count: usize) -> bool { + let mut throttled = false; + + let start = tokio::time::Instant::now(); + + // wait until we are the first in the queue + let mut notified = std::pin::pin!(self.queue.notified()); + if !notified.as_mut().enable() { + throttled = true; + notified.await; + } + + // notify the next waiter in the queue when we are done. + let _guard = Requeue(&self.queue); + + loop { + let res = self + .state + .lock() + .unwrap() + .add_tokens(&self.config, start, count as f64); + match res { + Ok(()) => return throttled, + Err(ready_at) => { + throttled = true; + tokio::time::sleep_until(ready_at).await; + } + } + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use tokio::time::Instant; + + use super::{LeakyBucketConfig, LeakyBucketState}; + + #[tokio::test(start_paused = true)] + async fn check() { + let config = LeakyBucketConfig { + // average 100rps + cost: Duration::from_millis(10), + // burst up to 100 requests + bucket_width: Duration::from_millis(1000), + }; + + let mut state = LeakyBucketState { + empty_at: Instant::now(), + }; + + // supports burst + { + // should work for 100 requests this instant + for _ in 0..100 { + state.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(10)); + } + + // doesn't overfill + { + // after 1s we should have an empty bucket again. + tokio::time::advance(Duration::from_secs(1)).await; + assert!(state.bucket_is_empty(Instant::now())); + + // after 1s more, we should not over count the tokens and allow more than 200 requests. + tokio::time::advance(Duration::from_secs(1)).await; + for _ in 0..100 { + state.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(10)); + } + + // supports sustained rate over a long period + { + tokio::time::advance(Duration::from_secs(1)).await; + + // should sustain 100rps + for _ in 0..2000 { + tokio::time::advance(Duration::from_millis(10)).await; + state.add_tokens(&config, Instant::now(), 1.0).unwrap(); + } + } + + // supports requesting more tokens than can be stored in the bucket + // we just wait a little bit longer upfront. + { + // start the bucket completely empty + tokio::time::advance(Duration::from_secs(5)).await; + assert!(state.bucket_is_empty(Instant::now())); + + // requesting 200 tokens of space should take 200*cost = 2s + // but we already have 1s available, so we wait 1s from start. + let start = Instant::now(); + + let ready = state.add_tokens(&config, start, 200.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_secs(1)); + + tokio::time::advance(Duration::from_millis(500)).await; + let ready = state.add_tokens(&config, start, 200.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(500)); + + tokio::time::advance(Duration::from_millis(500)).await; + state.add_tokens(&config, start, 200.0).unwrap(); + + // bucket should be completely full now + let ready = state.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(ready - Instant::now(), Duration::from_millis(10)); + } + } +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index f4fc0ba57b..218dd468b1 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -71,6 +71,7 @@ pub mod postgres_client; pub mod tracing_span_assert; +pub mod leaky_bucket; pub mod rate_limit; /// Simple once-barrier and a guard which keeps barrier awaiting. diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index f7b73dc984..71af43a4da 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -5,7 +5,9 @@ use metrics::{IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use strum_macros::{EnumString, EnumVariantNames}; -#[derive(EnumString, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy)] +#[derive( + EnumString, strum_macros::Display, EnumVariantNames, Eq, PartialEq, Debug, Clone, Copy, +)] #[strum(serialize_all = "snake_case")] pub enum LogFormat { Plain, @@ -274,6 +276,14 @@ impl From for SecretString { } } +impl FromStr for SecretString { + type Err = std::convert::Infallible; + + fn from_str(s: &str) -> Result { + Ok(Self(s.to_string())) + } +} + impl std::fmt::Debug for SecretString { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "[SECRET]") diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs index 557955bb88..f3f8f219e3 100644 --- a/libs/utils/src/rate_limit.rs +++ b/libs/utils/src/rate_limit.rs @@ -5,6 +5,15 @@ use std::time::{Duration, Instant}; pub struct RateLimit { last: Option, interval: Duration, + dropped: u64, +} + +pub struct RateLimitStats(u64); + +impl std::fmt::Display for RateLimitStats { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{} dropped calls", self.0) + } } impl RateLimit { @@ -12,20 +21,27 @@ impl RateLimit { Self { last: None, interval, + dropped: 0, } } /// Call `f` if the rate limit allows. /// Don't call it otherwise. pub fn call(&mut self, f: F) { + self.call2(|_| f()) + } + + pub fn call2(&mut self, f: F) { let now = Instant::now(); match self.last { Some(last) if now - last <= self.interval => { // ratelimit + self.dropped += 1; } _ => { self.last = Some(now); - f(); + f(RateLimitStats(self.dropped)); + self.dropped = 0; } } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 0e748ee3db..24373afca3 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints"] +testing = ["fail/failpoints", "pageserver_api/testing" ] [dependencies] anyhow.workspace = true @@ -16,6 +16,7 @@ arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true async-trait.workspace = true +bit_field.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true @@ -36,7 +37,6 @@ humantime.workspace = true humantime-serde.workspace = true hyper.workspace = true itertools.workspace = true -leaky-bucket.workspace = true md5.workspace = true nix.workspace = true # hack to get the number of worker threads tokio uses @@ -52,6 +52,7 @@ rand.workspace = true range-set-blaze = { version = "0.1.16", features = ["alloc"] } regex.workspace = true scopeguard.workspace = true +send-future.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } serde_path_to_error.workspace = true @@ -100,6 +101,7 @@ procfs.workspace = true criterion.workspace = true hex-literal.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] } +indoc.workspace = true [[bench]] name = "bench_layer_map" diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index bd99f5289d..72cbb6beab 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -103,13 +103,13 @@ async fn ingest( batch.push((key.to_compact(), lsn, data_ser_size, data.clone())); if batch.len() >= BATCH_SIZE { let this_batch = std::mem::take(&mut batch); - let serialized = SerializedBatch::from_values(this_batch); + let serialized = SerializedBatch::from_values(this_batch).unwrap(); layer.put_batch(serialized, &ctx).await?; } } if !batch.is_empty() { let this_batch = std::mem::take(&mut batch); - let serialized = SerializedBatch::from_values(this_batch); + let serialized = SerializedBatch::from_values(this_batch).unwrap(); layer.put_batch(serialized, &ctx).await?; } layer.freeze(lsn + 1).await; @@ -164,7 +164,11 @@ fn criterion_benchmark(c: &mut Criterion) { let conf: &'static PageServerConf = Box::leak(Box::new( pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), )); - virtual_file::init(16384, virtual_file::io_engine_for_bench()); + virtual_file::init( + 16384, + virtual_file::io_engine_for_bench(), + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, + ); page_cache::init(conf.page_cache_size); { diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index a938367334..d9b36bf3d4 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -7,7 +7,6 @@ license.workspace = true [dependencies] pageserver_api.workspace = true thiserror.workspace = true -async-trait.workspace = true reqwest = { workspace = true, features = [ "stream" ] } utils.workspace = true serde.workspace = true diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index ac3ff1bb89..737cb00835 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -419,6 +419,24 @@ impl Client { } } + pub async fn timeline_archival_config( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + req: &TimelineArchivalConfigRequest, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config", + self.mgmt_api_endpoint + ); + + self.request(Method::POST, &uri, req) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn timeline_detach_ancestor( &self, tenant_shard_id: TenantShardId, @@ -506,6 +524,16 @@ impl Client { .map_err(Error::ReceiveBody) } + /// Configs io buffer alignment at runtime. + pub async fn put_io_alignment(&self, align: usize) -> Result<()> { + let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint); + self.request(Method::PUT, uri, align) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } + pub async fn get_utilization(&self) -> Result { let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint); self.get(uri) diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index b4bb239f44..a07107753e 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -144,7 +144,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. - pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + pageserver::virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, + ); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 3611b0baab..dd753398e2 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -59,7 +59,7 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1); page_cache::init(100); let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); @@ -89,6 +89,7 @@ async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result for (k, v) in all { let value = cursor.read_blob(v.pos(), ctx).await?; println!("key:{} value_len:{}", k, value.len()); + assert!(k.is_i128_representable(), "invalid key: "); } // TODO(chi): special handling for last key? Ok(()) @@ -189,7 +190,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + pageserver::virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, + ); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 3fabf62987..3b66b0c4aa 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -26,7 +26,7 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId}; use postgres_ffi::ControlFileData; use remote_storage::{RemotePath, RemoteStorageConfig}; use tokio_util::sync::CancellationToken; @@ -205,7 +205,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + DEFAULT_IO_BUFFER_ALIGNMENT, + ); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 4992f37465..ac4a732377 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -58,6 +58,11 @@ pub(crate) struct Args { /// [`pageserver_api::models::virtual_file::IoEngineKind`]. #[clap(long)] set_io_engine: Option, + + /// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers. + #[clap(long)] + set_io_alignment: Option, + targets: Option>, } @@ -124,6 +129,10 @@ async fn main_impl( mgmt_api_client.put_io_engine(engine_str).await?; } + if let Some(align) = args.set_io_alignment { + mgmt_api_client.put_io_alignment(align).await?; + } + // discover targets let timelines: Vec = crate::util::cli::targets::discover( &mgmt_api_client, diff --git a/pageserver/src/assert_u64_eq_usize.rs b/pageserver/src/assert_u64_eq_usize.rs new file mode 100644 index 0000000000..66ca7fd057 --- /dev/null +++ b/pageserver/src/assert_u64_eq_usize.rs @@ -0,0 +1,39 @@ +//! `u64`` and `usize`` aren't guaranteed to be identical in Rust, but life is much simpler if that's the case. + +pub(crate) const _ASSERT_U64_EQ_USIZE: () = { + if std::mem::size_of::() != std::mem::size_of::() { + panic!("the traits defined in this module assume that usize and u64 can be converted to each other without loss of information"); + } +}; + +pub(crate) trait U64IsUsize { + fn into_usize(self) -> usize; +} + +impl U64IsUsize for u64 { + #[inline(always)] + fn into_usize(self) -> usize { + #[allow(clippy::let_unit_value)] + let _ = _ASSERT_U64_EQ_USIZE; + self as usize + } +} + +pub(crate) trait UsizeIsU64 { + fn into_u64(self) -> u64; +} + +impl UsizeIsU64 for usize { + #[inline(always)] + fn into_u64(self) -> u64 { + #[allow(clippy::let_unit_value)] + let _ = _ASSERT_U64_EQ_USIZE; + self as u64 + } +} + +pub const fn u64_to_usize(x: u64) -> usize { + #[allow(clippy::let_unit_value)] + let _ = _ASSERT_U64_EQ_USIZE; + x as usize +} diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 7d404e50a5..2c60e8d7d1 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -5,6 +5,7 @@ use std::env; use std::env::{var, VarError}; use std::io::Read; +use std::str::FromStr; use std::sync::Arc; use std::time::Duration; @@ -125,6 +126,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); info!(?conf.compact_level0_phase1_value_access, "starting with setting for compact_level0_phase1_value_access"); + info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. @@ -182,7 +184,11 @@ fn main() -> anyhow::Result<()> { let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); + virtual_file::init( + conf.max_file_descriptors, + conf.virtual_file_io_engine, + conf.io_buffer_alignment, + ); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; @@ -218,27 +224,15 @@ fn initialize_config( } }; - let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) { - Ok(mut f) => { - let md = f.metadata().context("stat config file")?; - if md.is_file() { - let mut s = String::new(); - f.read_to_string(&mut s).context("read config file")?; - s.parse().context("parse config file toml")? - } else { - anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}"); - } - } - Err(e) => { - anyhow::bail!("open pageserver config: {e}: {cfg_file_path}"); - } - }; - - debug!("Using pageserver toml: {config}"); - - // Construct the runtime representation - let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir) - .context("Failed to parse pageserver configuration")?; + let config_file_contents = + std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?; + let config_toml = serde_path_to_error::deserialize( + toml_edit::de::Deserializer::from_str(&config_file_contents) + .context("build toml deserializer")?, + ) + .context("deserialize config toml")?; + let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir) + .context("runtime-validation of config toml")?; Ok(Box::leak(Box::new(conf))) } diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 0ebaf78840..c159b66905 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -4,11 +4,13 @@ //! file, or on the command line. //! See also `settings.md` for better description on every parameter. -use anyhow::{anyhow, bail, ensure, Context, Result}; -use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; +use anyhow::{bail, ensure, Context}; +use pageserver_api::models::ImageCompressionAlgorithm; +use pageserver_api::{ + config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes}, + shard::TenantShardId, +}; use remote_storage::{RemotePath, RemoteStorageConfig}; -use serde::de::IntoDeserializer; -use serde::{self, Deserialize}; use std::env; use storage_broker::Uri; use utils::crashsafe::path_with_suffix_extension; @@ -17,10 +19,8 @@ use utils::logging::SecretString; use once_cell::sync::OnceCell; use reqwest::Url; use std::num::NonZeroUsize; -use std::str::FromStr; use std::sync::Arc; use std::time::Duration; -use toml_edit::{Document, Item}; use camino::{Utf8Path, Utf8PathBuf}; use postgres_backend::AuthType; @@ -29,136 +29,27 @@ use utils::{ logging::LogFormat, }; -use crate::l0_flush::L0FlushConfig; -use crate::tenant::config::TenantConfOpt; -use crate::tenant::timeline::compaction::CompactL0Phase1ValueAccess; -use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; +use crate::tenant::storage_layer::inmemory_layer::IndexEntry; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; -use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; -use crate::{tenant::config::TenantConf, virtual_file}; +use crate::virtual_file; +use crate::virtual_file::io_engine; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; -use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; - -use self::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE; - -pub mod defaults { - use crate::tenant::config::defaults::*; - use const_format::formatcp; - - pub use pageserver_api::config::{ - DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, - DEFAULT_PG_LISTEN_PORT, - }; - pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; - - pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "300 s"; - pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; - - pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; - - pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; - pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; - - pub const DEFAULT_LOG_FORMAT: &str = "plain"; - - pub const DEFAULT_CONCURRENT_TENANT_WARMUP: usize = 8; - - pub const DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES: usize = - super::ConfigurableSemaphore::DEFAULT_INITIAL.get(); - - pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min"; - pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option = None; - pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min"; - pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; - - pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; - pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; - - pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; - - #[cfg(target_os = "linux")] - pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "tokio-epoll-uring"; - - #[cfg(not(target_os = "linux"))] - pub const DEFAULT_VIRTUAL_FILE_IO_ENGINE: &str = "std-fs"; - - pub const DEFAULT_GET_VECTORED_IMPL: &str = "vectored"; - - pub const DEFAULT_GET_IMPL: &str = "vectored"; - - pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB - - pub const DEFAULT_IMAGE_COMPRESSION: &str = "zstd(1)"; - - pub const DEFAULT_VALIDATE_VECTORED_GET: bool = false; - - pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; - - /// - /// Default built-in configuration file. - /// - pub const DEFAULT_CONFIG_FILE: &str = formatcp!( - r#" -# Initial configuration file created by 'pageserver --init' -#listen_pg_addr = '{DEFAULT_PG_LISTEN_ADDR}' -#listen_http_addr = '{DEFAULT_HTTP_LISTEN_ADDR}' - -#wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}' -#wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}' - -#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE} -#max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS} - -# initial superuser role name to use when creating a new tenant -#initial_superuser_name = '{DEFAULT_SUPERUSER}' - -#broker_endpoint = '{BROKER_DEFAULT_ENDPOINT}' - -#log_format = '{DEFAULT_LOG_FORMAT}' - -#concurrent_tenant_size_logical_size_queries = '{DEFAULT_CONCURRENT_TENANT_SIZE_LOGICAL_SIZE_QUERIES}' -#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}' - -#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}' -#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}' - -#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}} - -#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' - -#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} - -#virtual_file_io_engine = '{DEFAULT_VIRTUAL_FILE_IO_ENGINE}' - -#max_vectored_read_bytes = '{DEFAULT_MAX_VECTORED_READ_BYTES}' - -[tenant_config] -#checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes -#checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} -#compaction_target_size = {DEFAULT_COMPACTION_TARGET_SIZE} # in bytes -#compaction_period = '{DEFAULT_COMPACTION_PERIOD}' -#compaction_threshold = {DEFAULT_COMPACTION_THRESHOLD} - -#gc_period = '{DEFAULT_GC_PERIOD}' -#gc_horizon = {DEFAULT_GC_HORIZON} -#image_creation_threshold = {DEFAULT_IMAGE_CREATION_THRESHOLD} -#pitr_interval = '{DEFAULT_PITR_INTERVAL}' - -#min_resident_size_override = .. # in bytes -#evictions_low_residence_duration_metric_threshold = '{DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD}' - -#heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} -#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} - -#ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} - -#[remote_storage] - -"# - ); -} - +/// Global state of pageserver. +/// +/// It's mostly immutable configuration, but some semaphores and the +/// like crept in over time and the name stuck. +/// +/// Instantiated by deserializing `pageserver.toml` into [`pageserver_api::config::ConfigToml`] +/// and passing that to [`PageServerConf::parse_and_validate`]. +/// +/// # Adding a New Field +/// +/// 1. Add the field to `pageserver_api::config::ConfigToml`. +/// 2. Fix compiler errors (exhaustive destructuring will guide you). +/// +/// For fields that require additional validation or filling in of defaults at runtime, +/// check for examples in the [`PageServerConf::parse_and_validate`] method. #[derive(Debug, Clone, PartialEq, Eq)] pub struct PageServerConf { // Identifier of that particular pageserver so e g safekeepers @@ -204,7 +95,7 @@ pub struct PageServerConf { pub remote_storage_config: Option, - pub default_tenant_conf: TenantConf, + pub default_tenant_conf: crate::tenant::config::TenantConf, /// Storage broker endpoints to connect to. pub broker_endpoint: Uri, @@ -281,14 +172,16 @@ pub struct PageServerConf { /// Setting this to zero disables limits on total ephemeral layer size. pub ephemeral_bytes_per_memory_kb: usize, - pub l0_flush: L0FlushConfig, + pub l0_flush: crate::l0_flush::L0FlushConfig, /// This flag is temporary and will be removed after gradual rollout. /// See . - pub compact_level0_phase1_value_access: CompactL0Phase1ValueAccess, + pub compact_level0_phase1_value_access: pageserver_api::config::CompactL0Phase1ValueAccess, /// Direct IO settings pub virtual_file_direct_io: virtual_file::DirectIoMode, + + pub io_buffer_alignment: usize, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -299,464 +192,6 @@ pub struct PageServerConf { /// startup code to the connection code through a dozen layers. pub static SAFEKEEPER_AUTH_TOKEN: OnceCell> = OnceCell::new(); -// use dedicated enum for builder to better indicate the intention -// and avoid possible confusion with nested options -#[derive(Clone, Default)] -pub enum BuilderValue { - Set(T), - #[default] - NotSet, -} - -impl BuilderValue { - pub fn ok_or(&self, field_name: &'static str, default: BuilderValue) -> anyhow::Result { - match self { - Self::Set(v) => Ok(v.clone()), - Self::NotSet => match default { - BuilderValue::Set(v) => Ok(v.clone()), - BuilderValue::NotSet => { - anyhow::bail!("missing config value {field_name:?}") - } - }, - } - } -} - -// needed to simplify config construction -#[derive(Default)] -struct PageServerConfigBuilder { - listen_pg_addr: BuilderValue, - - listen_http_addr: BuilderValue, - - availability_zone: BuilderValue>, - - wait_lsn_timeout: BuilderValue, - wal_redo_timeout: BuilderValue, - - superuser: BuilderValue, - - page_cache_size: BuilderValue, - max_file_descriptors: BuilderValue, - - workdir: BuilderValue, - - pg_distrib_dir: BuilderValue, - - http_auth_type: BuilderValue, - pg_auth_type: BuilderValue, - - // - auth_validation_public_key_path: BuilderValue>, - remote_storage_config: BuilderValue>, - - broker_endpoint: BuilderValue, - broker_keepalive_interval: BuilderValue, - - log_format: BuilderValue, - - concurrent_tenant_warmup: BuilderValue, - concurrent_tenant_size_logical_size_queries: BuilderValue, - - metric_collection_interval: BuilderValue, - metric_collection_endpoint: BuilderValue>, - synthetic_size_calculation_interval: BuilderValue, - metric_collection_bucket: BuilderValue>, - - disk_usage_based_eviction: BuilderValue>, - - test_remote_failures: BuilderValue, - - ondemand_download_behavior_treat_error_as_warn: BuilderValue, - - background_task_maximum_delay: BuilderValue, - - control_plane_api: BuilderValue>, - control_plane_api_token: BuilderValue>, - control_plane_emergency_mode: BuilderValue, - - heatmap_upload_concurrency: BuilderValue, - secondary_download_concurrency: BuilderValue, - - ingest_batch_size: BuilderValue, - - virtual_file_io_engine: BuilderValue, - - max_vectored_read_bytes: BuilderValue, - - image_compression: BuilderValue, - - ephemeral_bytes_per_memory_kb: BuilderValue, - - l0_flush: BuilderValue, - - compact_level0_phase1_value_access: BuilderValue, - - virtual_file_direct_io: BuilderValue, -} - -impl PageServerConfigBuilder { - fn new() -> Self { - Self::default() - } - - #[inline(always)] - fn default_values() -> Self { - use self::BuilderValue::*; - use defaults::*; - Self { - listen_pg_addr: Set(DEFAULT_PG_LISTEN_ADDR.to_string()), - listen_http_addr: Set(DEFAULT_HTTP_LISTEN_ADDR.to_string()), - availability_zone: Set(None), - wait_lsn_timeout: Set(humantime::parse_duration(DEFAULT_WAIT_LSN_TIMEOUT) - .expect("cannot parse default wait lsn timeout")), - wal_redo_timeout: Set(humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) - .expect("cannot parse default wal redo timeout")), - superuser: Set(DEFAULT_SUPERUSER.to_string()), - page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE), - max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS), - workdir: Set(Utf8PathBuf::new()), - pg_distrib_dir: Set(Utf8PathBuf::from_path_buf( - env::current_dir().expect("cannot access current directory"), - ) - .expect("non-Unicode path") - .join("pg_install")), - http_auth_type: Set(AuthType::Trust), - pg_auth_type: Set(AuthType::Trust), - auth_validation_public_key_path: Set(None), - remote_storage_config: Set(None), - broker_endpoint: Set(storage_broker::DEFAULT_ENDPOINT - .parse() - .expect("failed to parse default broker endpoint")), - broker_keepalive_interval: Set(humantime::parse_duration( - storage_broker::DEFAULT_KEEPALIVE_INTERVAL, - ) - .expect("cannot parse default keepalive interval")), - log_format: Set(LogFormat::from_str(DEFAULT_LOG_FORMAT).unwrap()), - - concurrent_tenant_warmup: Set(NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) - .expect("Invalid default constant")), - concurrent_tenant_size_logical_size_queries: Set( - ConfigurableSemaphore::DEFAULT_INITIAL, - ), - metric_collection_interval: Set(humantime::parse_duration( - DEFAULT_METRIC_COLLECTION_INTERVAL, - ) - .expect("cannot parse default metric collection interval")), - synthetic_size_calculation_interval: Set(humantime::parse_duration( - DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL, - ) - .expect("cannot parse default synthetic size calculation interval")), - metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), - - metric_collection_bucket: Set(None), - - disk_usage_based_eviction: Set(None), - - test_remote_failures: Set(0), - - ondemand_download_behavior_treat_error_as_warn: Set(false), - - background_task_maximum_delay: Set(humantime::parse_duration( - DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY, - ) - .unwrap()), - - control_plane_api: Set(None), - control_plane_api_token: Set(None), - control_plane_emergency_mode: Set(false), - - heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), - secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), - - ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), - - virtual_file_io_engine: Set(DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap()), - - max_vectored_read_bytes: Set(MaxVectoredReadBytes( - NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), - )), - image_compression: Set(DEFAULT_IMAGE_COMPRESSION.parse().unwrap()), - ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), - l0_flush: Set(L0FlushConfig::default()), - compact_level0_phase1_value_access: Set(CompactL0Phase1ValueAccess::default()), - virtual_file_direct_io: Set(virtual_file::DirectIoMode::default()), - } - } -} - -impl PageServerConfigBuilder { - pub fn listen_pg_addr(&mut self, listen_pg_addr: String) { - self.listen_pg_addr = BuilderValue::Set(listen_pg_addr) - } - - pub fn listen_http_addr(&mut self, listen_http_addr: String) { - self.listen_http_addr = BuilderValue::Set(listen_http_addr) - } - - pub fn availability_zone(&mut self, availability_zone: Option) { - self.availability_zone = BuilderValue::Set(availability_zone) - } - - pub fn wait_lsn_timeout(&mut self, wait_lsn_timeout: Duration) { - self.wait_lsn_timeout = BuilderValue::Set(wait_lsn_timeout) - } - - pub fn wal_redo_timeout(&mut self, wal_redo_timeout: Duration) { - self.wal_redo_timeout = BuilderValue::Set(wal_redo_timeout) - } - - pub fn superuser(&mut self, superuser: String) { - self.superuser = BuilderValue::Set(superuser) - } - - pub fn page_cache_size(&mut self, page_cache_size: usize) { - self.page_cache_size = BuilderValue::Set(page_cache_size) - } - - pub fn max_file_descriptors(&mut self, max_file_descriptors: usize) { - self.max_file_descriptors = BuilderValue::Set(max_file_descriptors) - } - - pub fn workdir(&mut self, workdir: Utf8PathBuf) { - self.workdir = BuilderValue::Set(workdir) - } - - pub fn pg_distrib_dir(&mut self, pg_distrib_dir: Utf8PathBuf) { - self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir) - } - - pub fn http_auth_type(&mut self, auth_type: AuthType) { - self.http_auth_type = BuilderValue::Set(auth_type) - } - - pub fn pg_auth_type(&mut self, auth_type: AuthType) { - self.pg_auth_type = BuilderValue::Set(auth_type) - } - - pub fn auth_validation_public_key_path( - &mut self, - auth_validation_public_key_path: Option, - ) { - self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path) - } - - pub fn remote_storage_config(&mut self, remote_storage_config: Option) { - self.remote_storage_config = BuilderValue::Set(remote_storage_config) - } - - pub fn broker_endpoint(&mut self, broker_endpoint: Uri) { - self.broker_endpoint = BuilderValue::Set(broker_endpoint) - } - - pub fn broker_keepalive_interval(&mut self, broker_keepalive_interval: Duration) { - self.broker_keepalive_interval = BuilderValue::Set(broker_keepalive_interval) - } - - pub fn log_format(&mut self, log_format: LogFormat) { - self.log_format = BuilderValue::Set(log_format) - } - - pub fn concurrent_tenant_warmup(&mut self, u: NonZeroUsize) { - self.concurrent_tenant_warmup = BuilderValue::Set(u); - } - - pub fn concurrent_tenant_size_logical_size_queries(&mut self, u: NonZeroUsize) { - self.concurrent_tenant_size_logical_size_queries = BuilderValue::Set(u); - } - - pub fn metric_collection_interval(&mut self, metric_collection_interval: Duration) { - self.metric_collection_interval = BuilderValue::Set(metric_collection_interval) - } - - pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option) { - self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint) - } - - pub fn metric_collection_bucket( - &mut self, - metric_collection_bucket: Option, - ) { - self.metric_collection_bucket = BuilderValue::Set(metric_collection_bucket) - } - - pub fn synthetic_size_calculation_interval( - &mut self, - synthetic_size_calculation_interval: Duration, - ) { - self.synthetic_size_calculation_interval = - BuilderValue::Set(synthetic_size_calculation_interval) - } - - pub fn test_remote_failures(&mut self, fail_first: u64) { - self.test_remote_failures = BuilderValue::Set(fail_first); - } - - pub fn disk_usage_based_eviction(&mut self, value: Option) { - self.disk_usage_based_eviction = BuilderValue::Set(value); - } - - pub fn ondemand_download_behavior_treat_error_as_warn( - &mut self, - ondemand_download_behavior_treat_error_as_warn: bool, - ) { - self.ondemand_download_behavior_treat_error_as_warn = - BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); - } - - pub fn background_task_maximum_delay(&mut self, delay: Duration) { - self.background_task_maximum_delay = BuilderValue::Set(delay); - } - - pub fn control_plane_api(&mut self, api: Option) { - self.control_plane_api = BuilderValue::Set(api) - } - - pub fn control_plane_api_token(&mut self, token: Option) { - self.control_plane_api_token = BuilderValue::Set(token) - } - - pub fn control_plane_emergency_mode(&mut self, enabled: bool) { - self.control_plane_emergency_mode = BuilderValue::Set(enabled) - } - - pub fn heatmap_upload_concurrency(&mut self, value: usize) { - self.heatmap_upload_concurrency = BuilderValue::Set(value) - } - - pub fn secondary_download_concurrency(&mut self, value: usize) { - self.secondary_download_concurrency = BuilderValue::Set(value) - } - - pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) { - self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) - } - - pub fn virtual_file_io_engine(&mut self, value: virtual_file::IoEngineKind) { - self.virtual_file_io_engine = BuilderValue::Set(value); - } - - pub fn get_max_vectored_read_bytes(&mut self, value: MaxVectoredReadBytes) { - self.max_vectored_read_bytes = BuilderValue::Set(value); - } - - pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { - self.image_compression = BuilderValue::Set(value); - } - - pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { - self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); - } - - pub fn l0_flush(&mut self, value: L0FlushConfig) { - self.l0_flush = BuilderValue::Set(value); - } - - pub fn compact_level0_phase1_value_access(&mut self, value: CompactL0Phase1ValueAccess) { - self.compact_level0_phase1_value_access = BuilderValue::Set(value); - } - - pub fn virtual_file_direct_io(&mut self, value: virtual_file::DirectIoMode) { - self.virtual_file_direct_io = BuilderValue::Set(value); - } - - pub fn build(self, id: NodeId) -> anyhow::Result { - let default = Self::default_values(); - - macro_rules! conf { - (USING DEFAULT { $($field:ident,)* } CUSTOM LOGIC { $($custom_field:ident : $custom_value:expr,)* } ) => { - PageServerConf { - $( - $field: self.$field.ok_or(stringify!($field), default.$field)?, - )* - $( - $custom_field: $custom_value, - )* - } - }; - } - - Ok(conf!( - USING DEFAULT - { - listen_pg_addr, - listen_http_addr, - availability_zone, - wait_lsn_timeout, - wal_redo_timeout, - superuser, - page_cache_size, - max_file_descriptors, - workdir, - pg_distrib_dir, - http_auth_type, - pg_auth_type, - auth_validation_public_key_path, - remote_storage_config, - broker_endpoint, - broker_keepalive_interval, - log_format, - metric_collection_interval, - metric_collection_endpoint, - metric_collection_bucket, - synthetic_size_calculation_interval, - disk_usage_based_eviction, - test_remote_failures, - ondemand_download_behavior_treat_error_as_warn, - background_task_maximum_delay, - control_plane_api, - control_plane_api_token, - control_plane_emergency_mode, - heatmap_upload_concurrency, - secondary_download_concurrency, - ingest_batch_size, - max_vectored_read_bytes, - image_compression, - ephemeral_bytes_per_memory_kb, - l0_flush, - compact_level0_phase1_value_access, - virtual_file_direct_io, - } - CUSTOM LOGIC - { - id: id, - // TenantConf is handled separately - default_tenant_conf: TenantConf::default(), - concurrent_tenant_warmup: ConfigurableSemaphore::new({ - self - .concurrent_tenant_warmup - .ok_or("concurrent_tenant_warmpup", - default.concurrent_tenant_warmup)? - }), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( - self - .concurrent_tenant_size_logical_size_queries - .ok_or("concurrent_tenant_size_logical_size_queries", - default.concurrent_tenant_size_logical_size_queries.clone())? - ), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( - // re-use `concurrent_tenant_size_logical_size_queries` - self - .concurrent_tenant_size_logical_size_queries - .ok_or("eviction_task_immitated_concurrent_logical_size_queries", - default.concurrent_tenant_size_logical_size_queries.clone())?, - ), - virtual_file_io_engine: match self.virtual_file_io_engine { - BuilderValue::Set(v) => v, - BuilderValue::NotSet => match crate::virtual_file::io_engine_feature_test().context("auto-detect virtual_file_io_engine")? { - io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise - io_engine::FeatureTestResult::Worse { engine, remark } => { - // TODO: bubble this up to the caller so we can tracing::warn! it. - eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); - engine - } - }, - }, - } - )) - } -} - impl PageServerConf { // // Repository paths, relative to workdir. @@ -865,131 +300,135 @@ impl PageServerConf { /// /// This leaves any options not present in the file in the built-in defaults. pub fn parse_and_validate( - node_id: NodeId, - toml: &Document, + id: NodeId, + config_toml: pageserver_api::config::ConfigToml, workdir: &Utf8Path, ) -> anyhow::Result { - let mut builder = PageServerConfigBuilder::new(); - builder.workdir(workdir.to_owned()); + let pageserver_api::config::ConfigToml { + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + wal_redo_timeout, + superuser, + page_cache_size, + max_file_descriptors, + pg_distrib_dir, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage, + broker_endpoint, + broker_keepalive_interval, + log_format, + metric_collection_interval, + metric_collection_endpoint, + metric_collection_bucket, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_api_token, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + max_vectored_read_bytes, + image_compression, + ephemeral_bytes_per_memory_kb, + compact_level0_phase1_value_access, + l0_flush, + virtual_file_direct_io, + concurrent_tenant_warmup, + concurrent_tenant_size_logical_size_queries, + virtual_file_io_engine, + io_buffer_alignment, + tenant_config, + } = config_toml; - let mut t_conf = TenantConfOpt::default(); + let mut conf = PageServerConf { + // ------------------------------------------------------------ + // fields that are already fully validated by the ConfigToml Deserialize impl + // ------------------------------------------------------------ + listen_pg_addr, + listen_http_addr, + availability_zone, + wait_lsn_timeout, + wal_redo_timeout, + superuser, + page_cache_size, + max_file_descriptors, + http_auth_type, + pg_auth_type, + auth_validation_public_key_path, + remote_storage_config: remote_storage, + broker_endpoint, + broker_keepalive_interval, + log_format, + metric_collection_interval, + metric_collection_endpoint, + metric_collection_bucket, + synthetic_size_calculation_interval, + disk_usage_based_eviction, + test_remote_failures, + ondemand_download_behavior_treat_error_as_warn, + background_task_maximum_delay, + control_plane_api, + control_plane_emergency_mode, + heatmap_upload_concurrency, + secondary_download_concurrency, + ingest_batch_size, + max_vectored_read_bytes, + image_compression, + ephemeral_bytes_per_memory_kb, + compact_level0_phase1_value_access, + virtual_file_direct_io, + io_buffer_alignment, - for (key, item) in toml.iter() { - match key { - "listen_pg_addr" => builder.listen_pg_addr(parse_toml_string(key, item)?), - "listen_http_addr" => builder.listen_http_addr(parse_toml_string(key, item)?), - "availability_zone" => builder.availability_zone(Some(parse_toml_string(key, item)?)), - "wait_lsn_timeout" => builder.wait_lsn_timeout(parse_toml_duration(key, item)?), - "wal_redo_timeout" => builder.wal_redo_timeout(parse_toml_duration(key, item)?), - "initial_superuser_name" => builder.superuser(parse_toml_string(key, item)?), - "page_cache_size" => builder.page_cache_size(parse_toml_u64(key, item)? as usize), - "max_file_descriptors" => { - builder.max_file_descriptors(parse_toml_u64(key, item)? as usize) - } - "pg_distrib_dir" => { - builder.pg_distrib_dir(Utf8PathBuf::from(parse_toml_string(key, item)?)) - } - "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some( - Utf8PathBuf::from(parse_toml_string(key, item)?), - )), - "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), - "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), - "remote_storage" => { - builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?)) - } - "tenant_config" => { - t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; - } - "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?), - "broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?), - "log_format" => builder.log_format( - LogFormat::from_config(&parse_toml_string(key, item)?)? - ), - "concurrent_tenant_warmup" => builder.concurrent_tenant_warmup({ - let input = parse_toml_string(key, item)?; - let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; - NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? - }), - "concurrent_tenant_size_logical_size_queries" => builder.concurrent_tenant_size_logical_size_queries({ - let input = parse_toml_string(key, item)?; - let permits = input.parse::().context("expected a number of initial permits, not {s:?}")?; - NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")? - }), - "metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?), - "metric_collection_endpoint" => { - let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?; - builder.metric_collection_endpoint(Some(endpoint)); - }, - "metric_collection_bucket" => { - builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?)) - } - "synthetic_size_calculation_interval" => - builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), - "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), - "disk_usage_based_eviction" => { - tracing::info!("disk_usage_based_eviction: {:#?}", &item); - builder.disk_usage_based_eviction( - deserialize_from_item("disk_usage_based_eviction", item) - .context("parse disk_usage_based_eviction")? - ) - }, - "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), - "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?), - "control_plane_api" => { - let parsed = parse_toml_string(key, item)?; - if parsed.is_empty() { - builder.control_plane_api(None) - } else { - builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?)) + // ------------------------------------------------------------ + // fields that require additional validation or custom handling + // ------------------------------------------------------------ + workdir: workdir.to_owned(), + pg_distrib_dir: pg_distrib_dir.unwrap_or_else(|| { + std::env::current_dir() + .expect("current_dir() failed") + .try_into() + .expect("current_dir() is not a valid Utf8Path") + }), + control_plane_api_token: control_plane_api_token.map(SecretString::from), + id, + default_tenant_conf: tenant_config, + concurrent_tenant_warmup: ConfigurableSemaphore::new(concurrent_tenant_warmup), + concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::new( + concurrent_tenant_size_logical_size_queries, + ), + eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::new( + // re-use `concurrent_tenant_size_logical_size_queries` + concurrent_tenant_size_logical_size_queries, + ), + virtual_file_io_engine: match virtual_file_io_engine { + Some(v) => v, + None => match crate::virtual_file::io_engine_feature_test() + .context("auto-detect virtual_file_io_engine")? + { + io_engine::FeatureTestResult::PlatformPreferred(v) => v, // make no noise + io_engine::FeatureTestResult::Worse { engine, remark } => { + // TODO: bubble this up to the caller so we can tracing::warn! it. + eprintln!("auto-detected IO engine is not platform-preferred: engine={engine:?} remark={remark:?}"); + engine } }, - "control_plane_api_token" => { - let parsed = parse_toml_string(key, item)?; - if parsed.is_empty() { - builder.control_plane_api_token(None) - } else { - builder.control_plane_api_token(Some(parsed.into())) - } - }, - "control_plane_emergency_mode" => { - builder.control_plane_emergency_mode(parse_toml_bool(key, item)?) - }, - "heatmap_upload_concurrency" => { - builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize) - }, - "secondary_download_concurrency" => { - builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize) - }, - "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), - "virtual_file_io_engine" => { - builder.virtual_file_io_engine(parse_toml_from_str("virtual_file_io_engine", item)?) - } - "max_vectored_read_bytes" => { - let bytes = parse_toml_u64("max_vectored_read_bytes", item)? as usize; - builder.get_max_vectored_read_bytes( - MaxVectoredReadBytes( - NonZeroUsize::new(bytes).expect("Max byte size of vectored read must be greater than 0"))) - } - "image_compression" => { - builder.get_image_compression(parse_toml_from_str("image_compression", item)?) - } - "ephemeral_bytes_per_memory_kb" => { - builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) - } - "l0_flush" => { - builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) - } - "compact_level0_phase1_value_access" => { - builder.compact_level0_phase1_value_access(utils::toml_edit_ext::deserialize_item(item).context("compact_level0_phase1_value_access")?) - } - "virtual_file_direct_io" => { - builder.virtual_file_direct_io(utils::toml_edit_ext::deserialize_item(item).context("virtual_file_direct_io")?) - } - _ => bail!("unrecognized pageserver option '{key}'"), - } - } + }, + l0_flush: l0_flush + .map(crate::l0_flush::L0FlushConfig::from) + .unwrap_or_default(), + }; - let mut conf = builder.build(node_id).context("invalid config")?; + // ------------------------------------------------------------ + // custom validation code that covers more than one field in isolation + // ------------------------------------------------------------ if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT { let auth_validation_public_key_path = conf @@ -1003,7 +442,14 @@ impl PageServerConf { ); } - conf.default_tenant_conf = t_conf.merge(TenantConf::default()); + IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance) + .map_err(anyhow::Error::msg) + .with_context(|| { + format!( + "effective checkpoint distance is unsupported: {}", + conf.default_tenant_conf.checkpoint_distance + ) + })?; Ok(conf) } @@ -1017,129 +463,25 @@ impl PageServerConf { pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self { let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install"); - PageServerConf { - id: NodeId(0), + let config_toml = pageserver_api::config::ConfigToml { wait_lsn_timeout: Duration::from_secs(60), wal_redo_timeout: Duration::from_secs(60), - page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, - max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, - listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - availability_zone: None, - superuser: "cloud_admin".to_string(), - workdir: repo_dir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: Duration::from_secs(5000), - log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP) - .expect("Invalid default constant"), - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default( - ), + pg_distrib_dir: Some(pg_distrib_dir), metric_collection_interval: Duration::from_secs(60), - metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, - metric_collection_bucket: None, synthetic_size_calculation_interval: Duration::from_secs(60), - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, background_task_maximum_delay: Duration::ZERO, - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, - virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - max_vectored_read_bytes: MaxVectoredReadBytes( - NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) - .expect("Invalid default constant"), - ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::default(), - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), - virtual_file_direct_io: virtual_file::DirectIoMode::default(), - } + ..Default::default() + }; + PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap() } } -#[derive(Deserialize)] +#[derive(serde::Deserialize, serde::Serialize)] #[serde(deny_unknown_fields)] pub struct PageserverIdentity { pub id: NodeId, } -// Helper functions to parse a toml Item - -fn parse_toml_string(name: &str, item: &Item) -> Result { - let s = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - Ok(s.to_string()) -} - -fn parse_toml_u64(name: &str, item: &Item) -> Result { - // A toml integer is signed, so it cannot represent the full range of an u64. That's OK - // for our use, though. - let i: i64 = item - .as_integer() - .with_context(|| format!("configure option {name} is not an integer"))?; - if i < 0 { - bail!("configure option {name} cannot be negative"); - } - Ok(i as u64) -} - -fn parse_toml_bool(name: &str, item: &Item) -> Result { - item.as_bool() - .with_context(|| format!("configure option {name} is not a bool")) -} - -fn parse_toml_duration(name: &str, item: &Item) -> Result { - let s = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - - Ok(humantime::parse_duration(s)?) -} - -fn parse_toml_from_str(name: &str, item: &Item) -> anyhow::Result -where - T: FromStr, - ::Err: std::fmt::Display, -{ - let v = item - .as_str() - .with_context(|| format!("configure option {name} is not a string"))?; - T::from_str(v).map_err(|e| { - anyhow!( - "Failed to parse string as {parse_type} for configure option {name}: {e}", - parse_type = stringify!(T) - ) - }) -} - -fn deserialize_from_item(name: &str, item: &Item) -> anyhow::Result -where - T: serde::de::DeserializeOwned, -{ - // ValueDeserializer::new is not public, so use the ValueDeserializer's documented way - let deserializer = match item.clone().into_value() { - Ok(value) => value.into_deserializer(), - Err(item) => anyhow::bail!("toml_edit::Item '{item}' is not a toml_edit::Value"), - }; - T::deserialize(deserializer).with_context(|| format!("deserializing item for node {name}")) -} - /// Configurable semaphore permits setting. /// /// Does not allow semaphore permits to be zero, because at runtime initially zero permits and empty @@ -1201,467 +543,108 @@ impl ConfigurableSemaphore { #[cfg(test)] mod tests { - use std::{fs, num::NonZeroU32}; - use camino_tempfile::{tempdir, Utf8TempDir}; - use pageserver_api::models::EvictionPolicy; - use remote_storage::{RemoteStorageKind, S3Config}; - use utils::serde_percent::Percent; + use camino::Utf8PathBuf; + use utils::id::NodeId; - use super::*; - use crate::DEFAULT_PG_VERSION; - - const ALL_BASE_VALUES_TOML: &str = r#" -# Initial configuration file created by 'pageserver --init' - -listen_pg_addr = '127.0.0.1:64000' -listen_http_addr = '127.0.0.1:9898' - -wait_lsn_timeout = '111 s' -wal_redo_timeout = '111 s' - -page_cache_size = 444 -max_file_descriptors = 333 - -# initial superuser role name to use when creating a new tenant -initial_superuser_name = 'zzzz' - -metric_collection_interval = '222 s' -metric_collection_endpoint = 'http://localhost:80/metrics' -synthetic_size_calculation_interval = '333 s' - -log_format = 'json' -background_task_maximum_delay = '334 s' - -"#; + use super::PageServerConf; #[test] - fn parse_defaults() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; - // we have to create dummy values to overcome the validation errors - let config_string = - format!("pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",); - let toml = config_string.parse()?; - - let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); - - assert_eq!( - parsed_config, - PageServerConf { - id: NodeId(10), - listen_pg_addr: defaults::DEFAULT_PG_LISTEN_ADDR.to_string(), - listen_http_addr: defaults::DEFAULT_HTTP_LISTEN_ADDR.to_string(), - availability_zone: None, - wait_lsn_timeout: humantime::parse_duration(defaults::DEFAULT_WAIT_LSN_TIMEOUT)?, - wal_redo_timeout: humantime::parse_duration(defaults::DEFAULT_WAL_REDO_TIMEOUT)?, - superuser: defaults::DEFAULT_SUPERUSER.to_string(), - page_cache_size: defaults::DEFAULT_PAGE_CACHE_SIZE, - max_file_descriptors: defaults::DEFAULT_MAX_FILE_DESCRIPTORS, - workdir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: humantime::parse_duration( - storage_broker::DEFAULT_KEEPALIVE_INTERVAL - )?, - log_format: LogFormat::from_str(defaults::DEFAULT_LOG_FORMAT).unwrap(), - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: - ConfigurableSemaphore::default(), - metric_collection_interval: humantime::parse_duration( - defaults::DEFAULT_METRIC_COLLECTION_INTERVAL - )?, - metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, - metric_collection_bucket: None, - synthetic_size_calculation_interval: humantime::parse_duration( - defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL - )?, - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, - background_task_maximum_delay: humantime::parse_duration( - defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY - )?, - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, - virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - max_vectored_read_bytes: MaxVectoredReadBytes( - NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) - .expect("Invalid default constant") - ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::default(), - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), - virtual_file_direct_io: virtual_file::DirectIoMode::default(), - }, - "Correct defaults should be used when no config values are provided" - ); - - Ok(()) - } - - #[test] - fn parse_basic_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = storage_broker::DEFAULT_ENDPOINT; - - let config_string = format!( - "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'", - ); - let toml = config_string.parse()?; - - let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}")); - - assert_eq!( - parsed_config, - PageServerConf { - id: NodeId(10), - listen_pg_addr: "127.0.0.1:64000".to_string(), - listen_http_addr: "127.0.0.1:9898".to_string(), - availability_zone: None, - wait_lsn_timeout: Duration::from_secs(111), - wal_redo_timeout: Duration::from_secs(111), - superuser: "zzzz".to_string(), - page_cache_size: 444, - max_file_descriptors: 333, - workdir, - pg_distrib_dir, - http_auth_type: AuthType::Trust, - pg_auth_type: AuthType::Trust, - auth_validation_public_key_path: None, - remote_storage_config: None, - default_tenant_conf: TenantConf::default(), - broker_endpoint: storage_broker::DEFAULT_ENDPOINT.parse().unwrap(), - broker_keepalive_interval: Duration::from_secs(5), - log_format: LogFormat::Json, - concurrent_tenant_warmup: ConfigurableSemaphore::new( - NonZeroUsize::new(DEFAULT_CONCURRENT_TENANT_WARMUP).unwrap() - ), - concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore::default(), - eviction_task_immitated_concurrent_logical_size_queries: - ConfigurableSemaphore::default(), - metric_collection_interval: Duration::from_secs(222), - metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), - metric_collection_bucket: None, - synthetic_size_calculation_interval: Duration::from_secs(333), - disk_usage_based_eviction: None, - test_remote_failures: 0, - ondemand_download_behavior_treat_error_as_warn: false, - background_task_maximum_delay: Duration::from_secs(334), - control_plane_api: None, - control_plane_api_token: None, - control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, - secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, - ingest_batch_size: 100, - virtual_file_io_engine: DEFAULT_VIRTUAL_FILE_IO_ENGINE.parse().unwrap(), - max_vectored_read_bytes: MaxVectoredReadBytes( - NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) - .expect("Invalid default constant") - ), - image_compression: defaults::DEFAULT_IMAGE_COMPRESSION.parse().unwrap(), - ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, - l0_flush: L0FlushConfig::default(), - compact_level0_phase1_value_access: CompactL0Phase1ValueAccess::default(), - virtual_file_direct_io: virtual_file::DirectIoMode::default(), - }, - "Should be able to parse all basic config values correctly" - ); - - Ok(()) - } - - #[test] - fn parse_remote_fs_storage_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - let broker_endpoint = "http://127.0.0.1:7777"; - - let local_storage_path = tempdir.path().join("local_remote_storage"); - - let identical_toml_declarations = &[ - format!( - r#"[remote_storage] -local_path = '{local_storage_path}'"#, - ), - format!("remote_storage={{local_path='{local_storage_path}'}}"), - ]; - - for remote_storage_config_str in identical_toml_declarations { - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -{remote_storage_config_str}"#, - ); - - let toml = config_string.parse()?; - - let parsed_remote_storage_config = - PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for the local FS"); - - assert_eq!( - parsed_remote_storage_config, - RemoteStorageConfig { - storage: RemoteStorageKind::LocalFs { local_path: local_storage_path.clone() }, - timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, - }, - "Remote storage config should correctly parse the local FS config and fill other storage defaults" - ); - } - Ok(()) - } - - #[test] - fn parse_remote_s3_storage_config() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let bucket_name = "some-sample-bucket".to_string(); - let bucket_region = "eu-north-1".to_string(); - let prefix_in_bucket = "test_prefix".to_string(); - let endpoint = "http://localhost:5000".to_string(); - let max_concurrent_syncs = NonZeroUsize::new(111).unwrap(); - let max_sync_errors = NonZeroU32::new(222).unwrap(); - let s3_concurrency_limit = NonZeroUsize::new(333).unwrap(); - let broker_endpoint = "http://127.0.0.1:7777"; - - let identical_toml_declarations = &[ - format!( - r#"[remote_storage] -max_concurrent_syncs = {max_concurrent_syncs} -max_sync_errors = {max_sync_errors} -bucket_name = '{bucket_name}' -bucket_region = '{bucket_region}' -prefix_in_bucket = '{prefix_in_bucket}' -endpoint = '{endpoint}' -concurrency_limit = {s3_concurrency_limit}"# - ), - format!( - "remote_storage={{max_concurrent_syncs={max_concurrent_syncs}, max_sync_errors={max_sync_errors}, bucket_name='{bucket_name}',\ - bucket_region='{bucket_region}', prefix_in_bucket='{prefix_in_bucket}', endpoint='{endpoint}', concurrency_limit={s3_concurrency_limit}}}", - ), - ]; - - for remote_storage_config_str in identical_toml_declarations { - let config_string = format!( - r#"{ALL_BASE_VALUES_TOML} -pg_distrib_dir='{pg_distrib_dir}' -broker_endpoint = '{broker_endpoint}' - -{remote_storage_config_str}"#, - ); - - let toml = config_string.parse()?; - - let parsed_remote_storage_config = - PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir) - .unwrap_or_else(|e| { - panic!("Failed to parse config '{config_string}', reason: {e:?}") - }) - .remote_storage_config - .expect("Should have remote storage config for S3"); - - assert_eq!( - parsed_remote_storage_config, - RemoteStorageConfig { - storage: RemoteStorageKind::AwsS3(S3Config { - bucket_name: bucket_name.clone(), - bucket_region: bucket_region.clone(), - prefix_in_bucket: Some(prefix_in_bucket.clone()), - endpoint: Some(endpoint.clone()), - concurrency_limit: s3_concurrency_limit, - max_keys_per_list_response: None, - upload_storage_class: None, - }), - timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, - }, - "Remote storage config should correctly parse the S3 config" - ); - } - Ok(()) - } - - #[test] - fn parse_incorrect_tenant_config() -> anyhow::Result<()> { - let config_string = r#" - [tenant_config] - checkpoint_distance = -1 # supposed to be an u64 - "# - .to_string(); - - let toml: Document = config_string.parse()?; - let item = toml.get("tenant_config").unwrap(); - let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err(); - - let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64"; - assert_eq!(error.to_string(), expected_error_str); - - Ok(()) - } - - #[test] - fn parse_override_tenant_config() -> anyhow::Result<()> { - let config_string = r#"tenant_config={ min_resident_size_override = 400 }"#.to_string(); - - let toml: Document = config_string.parse()?; - let item = toml.get("tenant_config").unwrap(); - let conf = TenantConfOpt::try_from(item.to_owned()).unwrap(); - - assert_eq!(conf.min_resident_size_override, Some(400)); - - Ok(()) - } - - #[test] - fn eviction_pageserver_config_parse() -> anyhow::Result<()> { - let tempdir = tempdir()?; - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?; - - let pageserver_conf_toml = format!( - r#"pg_distrib_dir = "{pg_distrib_dir}" -metric_collection_endpoint = "http://sample.url" -metric_collection_interval = "10min" - -[disk_usage_based_eviction] -max_usage_pct = 80 -min_avail_bytes = 0 -period = "10s" - -[tenant_config] -evictions_low_residence_duration_metric_threshold = "20m" - -[tenant_config.eviction_policy] -kind = "LayerAccessThreshold" -period = "20m" -threshold = "20m" -"#, - ); - let toml: Document = pageserver_conf_toml.parse()?; - let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?; - - assert_eq!(conf.pg_distrib_dir, pg_distrib_dir); - assert_eq!( - conf.metric_collection_endpoint, - Some("http://sample.url".parse().unwrap()) - ); - assert_eq!( - conf.metric_collection_interval, - Duration::from_secs(10 * 60) - ); - assert_eq!( - conf.default_tenant_conf - .evictions_low_residence_duration_metric_threshold, - Duration::from_secs(20 * 60) - ); - - // Assert that the node id provided by the indentity file (threaded - // through the call to [`PageServerConf::parse_and_validate`] is - // used. - assert_eq!(conf.id, NodeId(333)); - assert_eq!( - conf.disk_usage_based_eviction, - Some(DiskUsageEvictionTaskConfig { - max_usage_pct: Percent::new(80).unwrap(), - min_avail_bytes: 0, - period: Duration::from_secs(10), - #[cfg(feature = "testing")] - mock_statvfs: None, - eviction_order: Default::default(), - }) - ); - - match &conf.default_tenant_conf.eviction_policy { - EvictionPolicy::LayerAccessThreshold(eviction_threshold) => { - assert_eq!(eviction_threshold.period, Duration::from_secs(20 * 60)); - assert_eq!(eviction_threshold.threshold, Duration::from_secs(20 * 60)); - } - other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), - } - - Ok(()) - } - - #[test] - fn parse_imitation_only_pageserver_config() { - let tempdir = tempdir().unwrap(); - let (workdir, pg_distrib_dir) = prepare_fs(&tempdir).unwrap(); - - let pageserver_conf_toml = format!( - r#"pg_distrib_dir = "{pg_distrib_dir}" -metric_collection_endpoint = "http://sample.url" -metric_collection_interval = "10min" - -[tenant_config] -evictions_low_residence_duration_metric_threshold = "20m" - -[tenant_config.eviction_policy] -kind = "OnlyImitiate" -period = "20m" -threshold = "20m" -"#, - ); - let toml: Document = pageserver_conf_toml.parse().unwrap(); - let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap(); - - match &conf.default_tenant_conf.eviction_policy { - EvictionPolicy::OnlyImitiate(t) => { - assert_eq!(t.period, Duration::from_secs(20 * 60)); - assert_eq!(t.threshold, Duration::from_secs(20 * 60)); - } - other => unreachable!("Unexpected eviction policy tenant settings: {other:?}"), - } - } - - #[test] - fn empty_remote_storage_is_error() { - let tempdir = tempdir().unwrap(); - let (workdir, _) = prepare_fs(&tempdir).unwrap(); + fn test_empty_config_toml_is_valid() { + // we use Default impl of everything in this situation let input = r#" -remote_storage = {} "#; - let doc = toml_edit::Document::from_str(input).unwrap(); - let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir) - .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); - assert!(format!("{err}").contains("remote_storage"), "{err}"); + let config_toml = toml_edit::de::from_str::(input) + .expect("empty config is valid"); + let workdir = Utf8PathBuf::from("/nonexistent"); + PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir) + .expect("parse_and_validate"); } - fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { - let tempdir_path = tempdir.path(); + /// If there's a typo in the pageserver config, we'd rather catch that typo + /// and fail pageserver startup than silently ignoring the typo, leaving whoever + /// made it in the believe that their config change is effective. + /// + /// The default in serde is to allow unknown fields, so, we rely + /// on developer+review discipline to add `deny_unknown_fields` when adding + /// new structs to the config, and these tests here as a regression test. + /// + /// The alternative to all of this would be to allow unknown fields in the config. + /// To catch them, we could have a config check tool or mgmt API endpoint that + /// compares the effective config with the TOML on disk and makes sure that + /// the on-disk TOML is a strict subset of the effective config. + mod unknown_fields_handling { + macro_rules! test { + ($short_name:ident, $input:expr) => { + #[test] + fn $short_name() { + let input = $input; + let err = toml_edit::de::from_str::(&input) + .expect_err("some_invalid_field is an invalid field"); + dbg!(&err); + assert!(err.to_string().contains("some_invalid_field")); + } + }; + } + use indoc::indoc; - let workdir = tempdir_path.join("workdir"); - fs::create_dir_all(&workdir)?; + test!( + toplevel, + indoc! {r#" + some_invalid_field = 23 + "#} + ); - let pg_distrib_dir = tempdir_path.join("pg_distrib"); - let pg_distrib_dir_versioned = pg_distrib_dir.join(format!("v{DEFAULT_PG_VERSION}")); - fs::create_dir_all(&pg_distrib_dir_versioned)?; - let postgres_bin_dir = pg_distrib_dir_versioned.join("bin"); - fs::create_dir_all(&postgres_bin_dir)?; - fs::write(postgres_bin_dir.join("postgres"), "I'm postgres, trust me")?; + test!( + toplevel_nested, + indoc! {r#" + [some_invalid_field] + foo = 23 + "#} + ); - Ok((workdir, pg_distrib_dir)) + test!( + disk_usage_based_eviction, + indoc! {r#" + [disk_usage_based_eviction] + some_invalid_field = 23 + "#} + ); + + test!( + tenant_config, + indoc! {r#" + [tenant_config] + some_invalid_field = 23 + "#} + ); + + test!( + l0_flush, + indoc! {r#" + [l0_flush] + mode = "direct" + some_invalid_field = 23 + "#} + ); + + // TODO: fix this => https://github.com/neondatabase/neon/issues/8915 + // test!( + // remote_storage_config, + // indoc! {r#" + // [remote_storage_config] + // local_path = "/nonexistent" + // some_invalid_field = 23 + // "#} + // ); + + test!( + compact_level0_phase1_value_access, + indoc! {r#" + [compact_level0_phase1_value_access] + mode = "streaming-kmerge" + some_invalid_field = 23 + "#} + ); } } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f94d945d46..64a267e0e4 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -1,6 +1,8 @@ //! Periodically collect consumption metrics for all active tenants //! and push them to a HTTP endpoint. use crate::config::PageServerConf; +use crate::consumption_metrics::metrics::MetricsKey; +use crate::consumption_metrics::upload::KeyGen as _; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::size::CalculateSyntheticSizeError; @@ -8,6 +10,7 @@ use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::{mgr::TenantManager, LogicalSizeCalculationCause, Tenant}; use camino::Utf8PathBuf; use consumption_metrics::EventType; +use itertools::Itertools as _; use pageserver_api::models::TenantState; use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; @@ -19,9 +22,8 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::NodeId; -mod metrics; -use crate::consumption_metrics::metrics::MetricsKey; mod disk_cache; +mod metrics; mod upload; const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); @@ -143,6 +145,12 @@ async fn collect_metrics( // these are point in time, with variable "now" let metrics = metrics::collect_all_metrics(&tenant_manager, &cached_metrics, &ctx).await; + // Pre-generate event idempotency keys, to reuse them across the bucket + // and HTTP sinks. + let idempotency_keys = std::iter::repeat_with(|| node_id.as_str().generate()) + .take(metrics.len()) + .collect_vec(); + let metrics = Arc::new(metrics); // why not race cancellation here? because we are one of the last tasks, and if we are @@ -161,8 +169,14 @@ async fn collect_metrics( } if let Some(bucket_client) = &bucket_client { - let res = - upload::upload_metrics_bucket(bucket_client, &cancel, &node_id, &metrics).await; + let res = upload::upload_metrics_bucket( + bucket_client, + &cancel, + &node_id, + &metrics, + &idempotency_keys, + ) + .await; if let Err(e) = res { tracing::error!("failed to upload to S3: {e:#}"); } @@ -174,9 +188,9 @@ async fn collect_metrics( &client, metric_collection_endpoint, &cancel, - &node_id, &metrics, &mut cached_metrics, + &idempotency_keys, ) .await; if let Err(e) = res { diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 4e8283c3e4..0325ee403a 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -24,16 +24,16 @@ pub(super) async fn upload_metrics_http( client: &reqwest::Client, metric_collection_endpoint: &reqwest::Url, cancel: &CancellationToken, - node_id: &str, metrics: &[RawMetric], cached_metrics: &mut Cache, + idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { let mut uploaded = 0; let mut failed = 0; let started_at = std::time::Instant::now(); - let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, node_id); + let mut iter = serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys); while let Some(res) = iter.next() { let (chunk, body) = res?; @@ -87,6 +87,7 @@ pub(super) async fn upload_metrics_bucket( cancel: &CancellationToken, node_id: &str, metrics: &[RawMetric], + idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { if metrics.is_empty() { // Skip uploads if we have no metrics, so that readers don't have to handle the edge case @@ -106,7 +107,7 @@ pub(super) async fn upload_metrics_bucket( // Serialize and write into compressed buffer let started_at = std::time::Instant::now(); - for res in serialize_in_chunks(CHUNK_SIZE, metrics, node_id) { + for res in serialize_in_chunks(CHUNK_SIZE, metrics, idempotency_keys) { let (_chunk, body) = res?; gzip_writer.write_all(&body).await?; } @@ -134,29 +135,31 @@ pub(super) async fn upload_metrics_bucket( Ok(()) } -// The return type is quite ugly, but we gain testability in isolation -fn serialize_in_chunks<'a, F>( +/// Serializes the input metrics as JSON in chunks of chunk_size. The provided +/// idempotency keys are injected into the corresponding metric events (reused +/// across different metrics sinks), and must have the same length as input. +fn serialize_in_chunks<'a>( chunk_size: usize, input: &'a [RawMetric], - factory: F, + idempotency_keys: &'a [IdempotencyKey<'a>], ) -> impl ExactSizeIterator> + 'a -where - F: KeyGen<'a> + 'a, { use bytes::BufMut; - struct Iter<'a, F> { + assert_eq!(input.len(), idempotency_keys.len()); + + struct Iter<'a> { inner: std::slice::Chunks<'a, RawMetric>, + idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>, chunk_size: usize, // write to a BytesMut so that we can cheaply clone the frozen Bytes for retries buffer: bytes::BytesMut, // chunk amount of events are reused to produce the serialized document scratch: Vec>, - factory: F, } - impl<'a, F: KeyGen<'a>> Iterator for Iter<'a, F> { + impl<'a> Iterator for Iter<'a> { type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>; fn next(&mut self) -> Option { @@ -167,17 +170,14 @@ where self.scratch.extend( chunk .iter() - .map(|raw_metric| raw_metric.as_event(&self.factory.generate())), + .zip(&mut self.idempotency_keys) + .map(|(raw_metric, key)| raw_metric.as_event(key)), ); } else { // next rounds: update_in_place to reuse allocations assert_eq!(self.scratch.len(), self.chunk_size); - self.scratch - .iter_mut() - .zip(chunk.iter()) - .for_each(|(slot, raw_metric)| { - raw_metric.update_in_place(slot, &self.factory.generate()) - }); + itertools::izip!(self.scratch.iter_mut(), chunk, &mut self.idempotency_keys) + .for_each(|(slot, raw_metric, key)| raw_metric.update_in_place(slot, key)); } let res = serde_json::to_writer( @@ -198,18 +198,19 @@ where } } - impl<'a, F: KeyGen<'a>> ExactSizeIterator for Iter<'a, F> {} + impl<'a> ExactSizeIterator for Iter<'a> {} let buffer = bytes::BytesMut::new(); let inner = input.chunks(chunk_size); + let idempotency_keys = idempotency_keys.iter(); let scratch = Vec::new(); Iter { inner, + idempotency_keys, chunk_size, buffer, scratch, - factory, } } @@ -268,7 +269,7 @@ impl RawMetricExt for RawMetric { } } -trait KeyGen<'a>: Copy { +pub(crate) trait KeyGen<'a> { fn generate(&self) -> IdempotencyKey<'a>; } @@ -389,7 +390,10 @@ mod tests { let examples = metric_samples(); assert!(examples.len() > 1); - let factory = FixedGen::new(Utc::now(), "1", 42); + let now = Utc::now(); + let idempotency_keys = (0..examples.len()) + .map(|i| FixedGen::new(now, "1", i as u16).generate()) + .collect::>(); // need to use Event here because serde_json::Value uses default hashmap, not linked // hashmap @@ -398,13 +402,13 @@ mod tests { events: Vec>, } - let correct = serialize_in_chunks(examples.len(), &examples, factory) + let correct = serialize_in_chunks(examples.len(), &examples, &idempotency_keys) .map(|res| res.unwrap().1) .flat_map(|body| serde_json::from_slice::(&body).unwrap().events) .collect::>(); for chunk_size in 1..examples.len() { - let actual = serialize_in_chunks(chunk_size, &examples, factory) + let actual = serialize_in_chunks(chunk_size, &examples, &idempotency_keys) .map(|res| res.unwrap().1) .flat_map(|body| serde_json::from_slice::(&body).unwrap().events) .collect::>(); diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 0b07e07524..012cb8d96f 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -105,8 +105,10 @@ pub struct RequestContext { #[derive(Clone, Copy, PartialEq, Eq, Debug, enum_map::Enum, strum_macros::IntoStaticStr)] pub enum PageContentKind { Unknown, + DeltaLayerSummary, DeltaLayerBtreeNode, DeltaLayerValue, + ImageLayerSummary, ImageLayerBtreeNode, ImageLayerValue, InMemoryLayer, diff --git a/pageserver/src/control_plane_client.rs b/pageserver/src/control_plane_client.rs index b5d9267d79..56a536c387 100644 --- a/pageserver/src/control_plane_client.rs +++ b/pageserver/src/control_plane_client.rs @@ -141,12 +141,18 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient { m.other ); + let az_id = m + .other + .get("availability_zone_id") + .and_then(|jv| jv.as_str().map(|str| str.to_owned())); + Some(NodeRegisterRequest { node_id: conf.id, listen_pg_addr: m.postgres_host, listen_pg_port: m.postgres_port, listen_http_addr: m.http_host, listen_http_port: m.http_port, + availability_zone_id: az_id, }) } Err(e) => { diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 5e4a49bc56..a58fa2c0b1 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -41,19 +41,15 @@ // - The `#[allow(dead_code)]` above various structs are to suppress warnings about only the Debug impl // reading these fields. We use the Debug impl for semi-structured logging, though. -use std::{ - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::{sync::Arc, time::SystemTime}; use anyhow::Context; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{config::DiskUsageEvictionTaskConfig, shard::TenantShardId}; use remote_storage::GenericRemoteStorage; -use serde::{Deserialize, Serialize}; +use serde::Serialize; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::serde_percent::Percent; use utils::{completion, id::TimelineId}; use crate::{ @@ -69,23 +65,9 @@ use crate::{ CancellableTask, DiskUsageEvictionTask, }; -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct DiskUsageEvictionTaskConfig { - pub max_usage_pct: Percent, - pub min_avail_bytes: u64, - #[serde(with = "humantime_serde")] - pub period: Duration, - #[cfg(feature = "testing")] - pub mock_statvfs: Option, - /// Select sorting for evicted layers - #[serde(default)] - pub eviction_order: EvictionOrder, -} - /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(tag = "type", content = "args")] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum EvictionOrder { /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. @@ -96,23 +78,22 @@ pub enum EvictionOrder { /// we read tenants is deterministic. If we find the need to use this as `false`, we need /// to ensure nondeterminism by adding in a random number to break the /// `relative_last_activity==0.0` ties. - #[serde(default = "default_highest_layer_count_loses_first")] highest_layer_count_loses_first: bool, }, } -impl Default for EvictionOrder { - fn default() -> Self { - Self::RelativeAccessed { - highest_layer_count_loses_first: true, +impl From for EvictionOrder { + fn from(value: pageserver_api::config::EvictionOrder) -> Self { + match value { + pageserver_api::config::EvictionOrder::RelativeAccessed { + highest_layer_count_loses_first, + } => Self::RelativeAccessed { + highest_layer_count_loses_first, + }, } } } -fn default_highest_layer_count_loses_first() -> bool { - true -} - impl EvictionOrder { fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) { use EvictionOrder::*; @@ -295,7 +276,7 @@ async fn disk_usage_eviction_task_iteration( storage, usage_pre, tenant_manager, - task_config.eviction_order, + task_config.eviction_order.into(), cancel, ) .await; @@ -1257,7 +1238,6 @@ mod filesystem_level_usage { #[test] fn max_usage_pct_pressure() { - use super::EvictionOrder; use super::Usage as _; use std::time::Duration; use utils::serde_percent::Percent; @@ -1269,7 +1249,7 @@ mod filesystem_level_usage { period: Duration::MAX, #[cfg(feature = "testing")] mock_statvfs: None, - eviction_order: EvictionOrder::default(), + eviction_order: pageserver_api::config::EvictionOrder::default(), }, total_bytes: 100_000, avail_bytes: 0, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index cbcc162b32..d645f3b7b6 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -324,6 +324,9 @@ impl From for ApiError { match value { NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), Timeout => ApiError::Timeout("hit pageserver internal timeout".into()), + e @ HasArchivedParent(_) => { + ApiError::PreconditionFailed(e.to_string().into_boxed_str()) + } HasUnarchivedChildren(children) => ApiError::PreconditionFailed( format!( "Cannot archive timeline which has non-archived child timelines: {children:?}" @@ -465,7 +468,7 @@ async fn build_timeline_info_common( pg_version: timeline.pg_version, state, - is_archived, + is_archived: Some(is_archived), walreceiver_status, @@ -871,7 +874,10 @@ async fn get_timestamp_of_lsn_handler( match result { Some(time) => { - let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string(); + let time = format_rfc3339( + postgres_ffi::try_from_pg_timestamp(time).map_err(ApiError::InternalServerError)?, + ) + .to_string(); json_response(StatusCode::OK, time) } None => Err(ApiError::NotFound( @@ -1727,6 +1733,10 @@ async fn timeline_compact_handler( if Some(true) == parse_query_param::<_, bool>(&request, "enhanced_gc_bottom_most_compaction")? { flags |= CompactFlags::EnhancedGcBottomMostCompaction; } + if Some(true) == parse_query_param::<_, bool>(&request, "dry_run")? { + flags |= CompactFlags::DryRun; + } + let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); @@ -2066,7 +2076,7 @@ async fn disk_usage_eviction_run( evict_bytes: u64, #[serde(default)] - eviction_order: crate::disk_usage_eviction_task::EvictionOrder, + eviction_order: pageserver_api::config::EvictionOrder, } #[derive(Debug, Clone, Copy, serde::Serialize)] @@ -2102,7 +2112,7 @@ async fn disk_usage_eviction_run( &state.remote_storage, usage, &state.tenant_manager, - config.eviction_order, + config.eviction_order.into(), &cancel, ) .await; @@ -2344,6 +2354,20 @@ async fn put_io_engine_handler( json_response(StatusCode::OK, ()) } +async fn put_io_alignment_handler( + mut r: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + check_permission(&r, None)?; + let align: usize = json_request(&mut r).await?; + crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| { + ApiError::PreconditionFailed( + format!("Requested io alignment ({align}) is not a power of two").into(), + ) + })?; + json_response(StatusCode::OK, ()) +} + /// Polled by control plane. /// /// See [`crate::utilization`]. @@ -3031,6 +3055,9 @@ pub fn make_router( |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) + .put("/v1/io_alignment", |r| { + api_handler(r, put_io_alignment_handler) + }) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", |r| api_handler(r, force_aux_policy_switch_handler), diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index ed409d3130..5a0894cd1b 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -19,6 +19,7 @@ use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::WalIngest; +use crate::walrecord::decode_wal_record; use crate::walrecord::DecodedWALRecord; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; @@ -310,11 +311,13 @@ async fn import_wal( let mut nrecords = 0; let mut modification = tline.begin_modification(last_lsn); - let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, tline.pg_version)?; + walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) + .ingest_record(decoded, lsn, &mut modification, ctx) .await?; WAL_INGEST.records_committed.inc(); @@ -449,11 +452,12 @@ pub async fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); let mut modification = tline.begin_modification(last_lsn); - let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, tline.pg_version)?; walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) + .ingest_record(decoded, lsn, &mut modification, ctx) .await?; modification.commit(ctx).await?; last_lsn = lsn; diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs index 313a7961a6..491c9fb96c 100644 --- a/pageserver/src/l0_flush.rs +++ b/pageserver/src/l0_flush.rs @@ -1,9 +1,7 @@ use std::{num::NonZeroUsize, sync::Arc}; -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +#[derive(Debug, PartialEq, Eq, Clone)] pub enum L0FlushConfig { - #[serde(rename_all = "snake_case")] Direct { max_concurrency: NonZeroUsize }, } @@ -16,6 +14,16 @@ impl Default for L0FlushConfig { } } +impl From for L0FlushConfig { + fn from(config: pageserver_api::models::L0FlushConfig) -> Self { + match config { + pageserver_api::models::L0FlushConfig::Direct { max_concurrency } => { + Self::Direct { max_concurrency } + } + } + } +} + #[derive(Clone)] pub struct L0FlushGlobalState(Arc); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index dbfc9f3544..7a9cf495c7 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -16,6 +16,7 @@ pub mod l0_flush; use futures::{stream::FuturesUnordered, StreamExt}; pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; +mod assert_u64_eq_usize; pub mod aux_file; pub mod metrics; pub mod page_cache; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index cb1ab70147..39c6a6fb74 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -753,16 +753,21 @@ impl PageServerHandler { } if request_lsn < **latest_gc_cutoff_lsn { - // Check explicitly for INVALID just to get a less scary error message if the - // request is obviously bogus - return Err(if request_lsn == Lsn::INVALID { - PageStreamError::BadRequest("invalid LSN(0) in request".into()) - } else { - PageStreamError::BadRequest(format!( + let gc_info = &timeline.gc_info.read().unwrap(); + if !gc_info.leases.contains_key(&request_lsn) { + // The requested LSN is below gc cutoff and is not guarded by a lease. + + // Check explicitly for INVALID just to get a less scary error message if the + // request is obviously bogus + return Err(if request_lsn == Lsn::INVALID { + PageStreamError::BadRequest("invalid LSN(0) in request".into()) + } else { + PageStreamError::BadRequest(format!( "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", request_lsn, **latest_gc_cutoff_lsn ).into()) - }); + }); + } } // Wait for WAL up to 'not_modified_since' to arrive, if necessary @@ -789,6 +794,8 @@ impl PageServerHandler { } } + /// Handles the lsn lease request. + /// If a lease cannot be obtained, the client will receive NULL. #[instrument(skip_all, fields(shard_id, %lsn))] async fn handle_make_lsn_lease( &mut self, @@ -811,19 +818,25 @@ impl PageServerHandler { .await?; set_tracing_field_shard_id(&timeline); - let lease = timeline.make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx)?; - let valid_until = lease - .valid_until - .duration_since(SystemTime::UNIX_EPOCH) - .map_err(|e| QueryError::Other(e.into()))?; + let lease = timeline + .make_lsn_lease(lsn, timeline.get_lsn_lease_length(), ctx) + .inspect_err(|e| { + warn!("{e}"); + }) + .ok(); + let valid_until_str = lease.map(|l| { + l.valid_until + .duration_since(SystemTime::UNIX_EPOCH) + .expect("valid_until is earlier than UNIX_EPOCH") + .as_millis() + .to_string() + }); + let bytes = valid_until_str.as_ref().map(|x| x.as_bytes()); pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col( b"valid_until", )]))? - .write_message_noflush(&BeMessage::DataRow(&[Some( - &valid_until.as_millis().to_be_bytes(), - )]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + .write_message_noflush(&BeMessage::DataRow(&[bytes]))?; Ok(()) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index b7110d69b6..d28a214265 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -168,7 +168,9 @@ impl Timeline { DatadirModification { tline: self, pending_lsns: Vec::new(), - pending_updates: HashMap::new(), + pending_metadata_pages: HashMap::new(), + pending_data_pages: Vec::new(), + pending_zero_data_pages: Default::default(), pending_deletions: Vec::new(), pending_nblocks: 0, pending_directory_entries: Vec::new(), @@ -727,8 +729,12 @@ impl Timeline { let current_policy = self.last_aux_file_policy.load(); match current_policy { Some(AuxFilePolicy::V1) => { - warn!("this timeline is using deprecated aux file policy V1 (policy=V1)"); - self.list_aux_files_v1(lsn, ctx).await + let res = self.list_aux_files_v1(lsn, ctx).await?; + let empty_str = if res.is_empty() { ", empty" } else { "" }; + warn!( + "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})" + ); + Ok(res) } None => { let res = self.list_aux_files_v1(lsn, ctx).await?; @@ -1031,10 +1037,24 @@ pub struct DatadirModification<'a> { // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. pending_lsns: Vec, - pending_updates: HashMap>, pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, + /// Metadata writes, indexed by key so that they can be read from not-yet-committed modifications + /// while ingesting subsequent records. See [`Self::is_data_key`] for the definition of 'metadata'. + pending_metadata_pages: HashMap>, + + /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for + /// which keys are stored here. + pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>, + + // Sometimes during ingest, for example when extending a relation, we would like to write a zero page. However, + // if we encounter a write from postgres in the same wal record, we will drop this entry. + // + // Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed + // at the end of each wal record, and all these writes implicitly are at lsn Self::lsn + pending_zero_data_pages: HashSet, + /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. pending_directory_entries: Vec<(DirectoryKind, usize)>, @@ -1058,6 +1078,10 @@ impl<'a> DatadirModification<'a> { self.pending_bytes } + pub(crate) fn has_dirty_data_pages(&self) -> bool { + (!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty()) + } + /// Set the current lsn pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { ensure!( @@ -1066,6 +1090,10 @@ impl<'a> DatadirModification<'a> { lsn, self.lsn ); + + // If we are advancing LSN, then state from previous wal record should have been flushed. + assert!(self.pending_zero_data_pages.is_empty()); + if lsn > self.lsn { self.pending_lsns.push(self.lsn); self.lsn = lsn; @@ -1073,6 +1101,17 @@ impl<'a> DatadirModification<'a> { Ok(()) } + /// In this context, 'metadata' means keys that are only read by the pageserver internally, and 'data' means + /// keys that represent literal blocks that postgres can read. So data includes relation blocks and + /// SLRU blocks, which are read directly by postgres, and everything else is considered metadata. + /// + /// The distinction is important because data keys are handled on a fast path where dirty writes are + /// not readable until this modification is committed, whereas metadata keys are visible for read + /// via [`Self::get`] as soon as their record has been ingested. + fn is_data_key(key: &Key) -> bool { + key.is_rel_block_key() || key.is_slru_block_key() + } + /// Initialize a completely new repository. /// /// This inserts the directory metadata entries that are assumed to @@ -1180,6 +1219,31 @@ impl<'a> DatadirModification<'a> { Ok(()) } + pub(crate) fn put_rel_page_image_zero(&mut self, rel: RelTag, blknum: BlockNumber) { + self.pending_zero_data_pages + .insert(rel_block_to_key(rel, blknum).to_compact()); + self.pending_bytes += ZERO_PAGE.len(); + } + + pub(crate) fn put_slru_page_image_zero( + &mut self, + kind: SlruKind, + segno: u32, + blknum: BlockNumber, + ) { + self.pending_zero_data_pages + .insert(slru_block_to_key(kind, segno, blknum).to_compact()); + self.pending_bytes += ZERO_PAGE.len(); + } + + /// Call this at the end of each WAL record. + pub(crate) fn on_record_end(&mut self) { + let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages); + for key in pending_zero_data_pages { + self.put_data(key, Value::Image(ZERO_PAGE.clone())); + } + } + /// Store a relmapper file (pg_filenode.map) in the repository pub async fn put_relmap_file( &mut self, @@ -1597,7 +1661,7 @@ impl<'a> DatadirModification<'a> { if aux_files_key_v1.is_empty() { None } else { - warn!("this timeline is using deprecated aux file policy V1"); + warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)"); self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?; Some(AuxFilePolicy::V1) } @@ -1778,7 +1842,7 @@ impl<'a> DatadirModification<'a> { /// retains all the metadata, but data pages are flushed. That's again OK /// for bulk import, where you are just loading data pages and won't try to /// modify the same pages twice. - pub async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { + pub(crate) async fn flush(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { // Unless we have accumulated a decent amount of changes, it's not worth it // to scan through the pending_updates list. let pending_nblocks = self.pending_nblocks; @@ -1789,26 +1853,11 @@ impl<'a> DatadirModification<'a> { let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. - let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); - for (key, values) in self.pending_updates.drain() { - let mut write_batch = Vec::new(); - for (lsn, value_ser_size, value) in values { - if key.is_rel_block_key() || key.is_slru_block_key() { - // This bails out on first error without modifying pending_updates. - // That's Ok, cf this function's doc comment. - write_batch.push((key.to_compact(), lsn, value_ser_size, value)); - } else { - retained_pending_updates.entry(key).or_default().push(( - lsn, - value_ser_size, - value, - )); - } - } - writer.put_batch(write_batch, ctx).await?; - } + let pending_data_pages = std::mem::take(&mut self.pending_data_pages); - self.pending_updates = retained_pending_updates; + // This bails out on first error without modifying pending_updates. + // That's Ok, cf this function's doc comment. + writer.put_batch(pending_data_pages, ctx).await?; self.pending_bytes = 0; if pending_nblocks != 0 { @@ -1829,26 +1878,31 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { + // Commit should never be called mid-wal-record + assert!(self.pending_zero_data_pages.is_empty()); + let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; - if !self.pending_updates.is_empty() { - // Ordering: the items in this batch do not need to be in any global order, but values for - // a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on - // this to do efficient updates to its index. - let batch: Vec<(CompactKey, Lsn, usize, Value)> = self - .pending_updates + // Ordering: the items in this batch do not need to be in any global order, but values for + // a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on + // this to do efficient updates to its index. + let mut write_batch = std::mem::take(&mut self.pending_data_pages); + + write_batch.extend( + self.pending_metadata_pages .drain() .flat_map(|(key, values)| { - values.into_iter().map(move |(lsn, val_ser_size, value)| { - (key.to_compact(), lsn, val_ser_size, value) - }) - }) - .collect::>(); + values + .into_iter() + .map(move |(lsn, value_size, value)| (key, lsn, value_size, value)) + }), + ); - writer.put_batch(batch, ctx).await?; + if !write_batch.is_empty() { + writer.put_batch(write_batch, ctx).await?; } if !self.pending_deletions.is_empty() { @@ -1879,33 +1933,58 @@ impl<'a> DatadirModification<'a> { } pub(crate) fn len(&self) -> usize { - self.pending_updates.len() + self.pending_deletions.len() + self.pending_metadata_pages.len() + + self.pending_data_pages.len() + + self.pending_deletions.len() } - // Internal helper functions to batch the modifications - + /// Read a page from the Timeline we are writing to. For metadata pages, this passes through + /// a cache in Self, which makes writes earlier in this modification visible to WAL records later + /// in the modification. + /// + /// For data pages, reads pass directly to the owning Timeline: any ingest code which reads a data + /// page must ensure that the pages they read are already committed in Timeline, for example + /// DB create operations are always preceded by a call to commit(). This is special cased because + /// it's rare: all the 'normal' WAL operations will only read metadata pages such as relation sizes, + /// and not data pages. async fn get(&self, key: Key, ctx: &RequestContext) -> Result { - // Have we already updated the same key? Read the latest pending updated - // version in that case. - // - // Note: we don't check pending_deletions. It is an error to request a - // value that has been removed, deletion only avoids leaking storage. - if let Some(values) = self.pending_updates.get(&key) { - if let Some((_, _, value)) = values.last() { - return if let Value::Image(img) = value { - Ok(img.clone()) - } else { - // Currently, we never need to read back a WAL record that we - // inserted in the same "transaction". All the metadata updates - // work directly with Images, and we never need to read actual - // data pages. We could handle this if we had to, by calling - // the walredo manager, but let's keep it simple for now. - Err(PageReconstructError::Other(anyhow::anyhow!( - "unexpected pending WAL record" - ))) - }; + if !Self::is_data_key(&key) { + // Have we already updated the same key? Read the latest pending updated + // version in that case. + // + // Note: we don't check pending_deletions. It is an error to request a + // value that has been removed, deletion only avoids leaking storage. + if let Some(values) = self.pending_metadata_pages.get(&key.to_compact()) { + if let Some((_, _, value)) = values.last() { + return if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. + Err(PageReconstructError::Other(anyhow::anyhow!( + "unexpected pending WAL record" + ))) + }; + } + } + } else { + // This is an expensive check, so we only do it in debug mode. If reading a data key, + // this key should never be present in pending_data_pages. We ensure this by committing + // modifications before ingesting DB create operations, which are the only kind that reads + // data pages during ingest. + if cfg!(debug_assertions) { + for (dirty_key, _, _, _) in &self.pending_data_pages { + debug_assert!(&key.to_compact() != dirty_key); + } + + debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact())) } } + + // Metadata page cache miss, or we're reading a data page. let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); self.tline.get(key, lsn, ctx).await } @@ -1917,11 +1996,40 @@ impl<'a> DatadirModification<'a> { } fn put(&mut self, key: Key, val: Value) { - let values = self.pending_updates.entry(key).or_default(); + if Self::is_data_key(&key) { + self.put_data(key.to_compact(), val) + } else { + self.put_metadata(key.to_compact(), val) + } + } + + fn put_data(&mut self, key: CompactKey, val: Value) { + let val_serialized_size = val.serialized_size().unwrap() as usize; + + // If this page was previously zero'd in the same WalRecord, then drop the previous zero page write. This + // is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend), + // and the subsequent postgres-originating write + if self.pending_zero_data_pages.remove(&key) { + self.pending_bytes -= ZERO_PAGE.len(); + } + + self.pending_bytes += val_serialized_size; + self.pending_data_pages + .push((key, self.lsn, val_serialized_size, val)) + } + + fn put_metadata(&mut self, key: CompactKey, val: Value) { + let values = self.pending_metadata_pages.entry(key).or_default(); // Replace the previous value if it exists at the same lsn if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() { if *last_lsn == self.lsn { + // Update the pending_bytes contribution from this entry, and update the serialized size in place + self.pending_bytes -= *last_value_ser_size; *last_value_ser_size = val.serialized_size().unwrap() as usize; + self.pending_bytes += *last_value_ser_size; + + // Use the latest value, this replaces any earlier write to the same (key,lsn), such as much + // have been generated by synthesized zero page writes prior to the first real write to a page. *last_value = val; return; } diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index ede1791afa..5a6f6e5176 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -60,32 +60,7 @@ pub mod mock { use regex::Regex; use tracing::log::info; - #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] - #[serde(tag = "type")] - pub enum Behavior { - Success { - blocksize: u64, - total_blocks: u64, - name_filter: Option, - }, - Failure { - mocked_error: MockedError, - }, - } - - #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] - #[allow(clippy::upper_case_acronyms)] - pub enum MockedError { - EIO, - } - - impl From for nix::Error { - fn from(e: MockedError) -> Self { - match e { - MockedError::EIO => nix::Error::EIO, - } - } - } + pub use pageserver_api::config::statvfs::mock::Behavior; pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result { info!("running mocked statvfs"); @@ -116,6 +91,7 @@ pub mod mock { block_size: *blocksize, }) } + #[cfg(feature = "testing")] Behavior::Failure { mocked_error } => Err((*mocked_error).into()), } } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index ed9e001fd2..6a4e90dd55 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -146,6 +146,12 @@ impl FromStr for TokioRuntimeMode { } } +static TOKIO_THREAD_STACK_SIZE: Lazy = Lazy::new(|| { + env::var("NEON_PAGESERVER_TOKIO_THREAD_STACK_SIZE") + // the default 2MiB are insufficent, especially in debug mode + .unwrap_or_else(|| NonZeroUsize::new(4 * 1024 * 1024).unwrap()) +}); + static ONE_RUNTIME: Lazy> = Lazy::new(|| { let thread_name = "pageserver-tokio"; let Some(mode) = env::var("NEON_PAGESERVER_USE_ONE_RUNTIME") else { @@ -164,6 +170,7 @@ static ONE_RUNTIME: Lazy> = Lazy::new(|| { tokio::runtime::Builder::new_current_thread() .thread_name(thread_name) .enable_all() + .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) .build() .expect("failed to create one single runtime") } @@ -173,6 +180,7 @@ static ONE_RUNTIME: Lazy> = Lazy::new(|| { .thread_name(thread_name) .enable_all() .worker_threads(num_workers.get()) + .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) .build() .expect("failed to create one multi-threaded runtime") } @@ -199,6 +207,7 @@ macro_rules! pageserver_runtime { .thread_name($name) .worker_threads(TOKIO_WORKER_THREADS.get()) .enable_all() + .thread_stack_size(TOKIO_THREAD_STACK_SIZE.get()) .build() .expect(std::concat!("Failed to create runtime ", $name)) }); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0364d521b6..fb30857ddf 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -509,6 +509,9 @@ pub enum TimelineArchivalError { #[error("Timeout")] Timeout, + #[error("ancestor is archived: {}", .0)] + HasArchivedParent(TimelineId), + #[error("HasUnarchivedChildren")] HasUnarchivedChildren(Vec), @@ -524,6 +527,7 @@ impl Debug for TimelineArchivalError { match self { Self::NotFound => write!(f, "NotFound"), Self::Timeout => write!(f, "Timeout"), + Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(), Self::HasUnarchivedChildren(c) => { f.debug_tuple("HasUnarchivedChildren").field(c).finish() } @@ -877,6 +881,12 @@ impl Tenant { }); }; + // TODO: should also be rejecting tenant conf changes that violate this check. + if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) { + make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error); + return Ok(()); + } + let mut init_order = init_order; // take the completion because initial tenant loading will complete when all of // these tasks complete. @@ -1363,11 +1373,20 @@ impl Tenant { let timeline = { let timelines = self.timelines.lock().unwrap(); - let timeline = match timelines.get(&timeline_id) { - Some(t) => t, - None => return Err(TimelineArchivalError::NotFound), + let Some(timeline) = timelines.get(&timeline_id) else { + return Err(TimelineArchivalError::NotFound); }; + if state == TimelineArchivalState::Unarchived { + if let Some(ancestor_timeline) = timeline.ancestor_timeline() { + if ancestor_timeline.is_archived() == Some(true) { + return Err(TimelineArchivalError::HasArchivedParent( + ancestor_timeline.timeline_id, + )); + } + } + } + // Ensure that there are no non-archived child timelines let children: Vec = timelines .iter() diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index a245c99a88..dd70f6bbff 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -148,7 +148,7 @@ pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; /// The maximum size of blobs we support. The highest few bits /// are reserved for compression and other further uses. -const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff; +pub(crate) const MAX_SUPPORTED_BLOB_LEN: usize = 0x0fff_ffff; pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80; pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; @@ -326,7 +326,7 @@ impl BlobWriter { (self.write_all(io_buf.slice_len(), ctx).await, srcbuf) } else { // Write a 4-byte length header - if len > MAX_SUPPORTED_LEN { + if len > MAX_SUPPORTED_BLOB_LEN { return ( ( io_buf.slice_len(), diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 601b095155..3afa3a86b9 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -2,7 +2,6 @@ //! Low-level Block-oriented I/O functions //! -use super::ephemeral_file::EphemeralFile; use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; @@ -81,9 +80,7 @@ impl<'a> Deref for BlockLease<'a> { /// Unlike traits, we also support the read function to be async though. pub(crate) enum BlockReaderRef<'a> { FileBlockReader(&'a FileBlockReader<'a>), - EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), - Slice(&'a [u8]), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), #[cfg(test)] @@ -100,9 +97,7 @@ impl<'a> BlockReaderRef<'a> { use BlockReaderRef::*; match self { FileBlockReader(r) => r.read_blk(blknum, ctx).await, - EphemeralFile(r) => r.read_blk(blknum, ctx).await, Adapter(r) => r.read_blk(blknum, ctx).await, - Slice(s) => Self::read_blk_slice(s, blknum), #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] @@ -111,24 +106,6 @@ impl<'a> BlockReaderRef<'a> { } } -impl<'a> BlockReaderRef<'a> { - fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result { - let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap(); - let end = start.checked_add(PAGE_SZ).unwrap(); - if end > slice.len() { - return Err(std::io::Error::new( - std::io::ErrorKind::UnexpectedEof, - format!("slice too short, len={} end={}", slice.len(), end), - )); - } - let slice = &slice[start..end]; - let page_sized: &[u8; PAGE_SZ] = slice - .try_into() - .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ"); - Ok(BlockLease::Slice(page_sized)) - } -} - /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 48ff17db94..7e0344666b 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,11 +9,10 @@ //! may lead to a data loss. //! use anyhow::bail; +pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; use pageserver_api::models::AuxFilePolicy; -use pageserver_api::models::CompactionAlgorithm; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; -use pageserver_api::models::LsnLease; use pageserver_api::models::{self, ThrottleConfig}; use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize}; use serde::de::IntoDeserializer; @@ -23,50 +22,6 @@ use std::num::NonZeroU64; use std::time::Duration; use utils::generation::Generation; -pub mod defaults { - - // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB - // would be more appropriate. But a low value forces the code to be exercised more, - // which is good for now to trigger bugs. - // This parameter actually determines L0 layer file size. - pub const DEFAULT_CHECKPOINT_DISTANCE: u64 = 256 * 1024 * 1024; - pub const DEFAULT_CHECKPOINT_TIMEOUT: &str = "10 m"; - - // FIXME the below configs are only used by legacy algorithm. The new algorithm - // has different parameters. - - // Target file size, when creating image and delta layers. - // This parameter determines L1 layer file size. - pub const DEFAULT_COMPACTION_TARGET_SIZE: u64 = 128 * 1024 * 1024; - - pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s"; - pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; - pub const DEFAULT_COMPACTION_ALGORITHM: super::CompactionAlgorithm = - super::CompactionAlgorithm::Legacy; - - pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; - - // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. - // If there's a need to decrease this value, first make sure that GC - // doesn't hold a layer map write lock for non-trivial operations. - // Relevant: https://github.com/neondatabase/neon/issues/3394 - pub const DEFAULT_GC_PERIOD: &str = "1 hr"; - pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; - pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; - pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds"; - pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; - // The default limit on WAL lag should be set to avoid causing disconnects under high throughput - // scenarios: since the broker stats are updated ~1/s, a value of 1GiB should be sufficient for - // throughputs up to 1GiB/s per timeline. - pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 1024 * 1024 * 1024; - pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; - // By default ingest enough WAL for two new L0 layers before checking if new image - // image layers should be created. - pub const DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD: u8 = 2; - - pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; -} - #[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq, Eq)] pub(crate) enum AttachmentMode { /// Our generation is current as far as we know, and as far as we know we are the only attached @@ -281,96 +236,20 @@ impl LocationConf { } } -/// A tenant's calcuated configuration, which is the result of merging a -/// tenant's TenantConfOpt with the global TenantConf from PageServerConf. -/// -/// For storing and transmitting individual tenant's configuration, see -/// TenantConfOpt. -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct TenantConf { - // Flush out an inmemory layer, if it's holding WAL older than this - // This puts a backstop on how much WAL needs to be re-digested if the - // page server crashes. - // This parameter actually determines L0 layer file size. - pub checkpoint_distance: u64, - // Inmemory layer is also flushed at least once in checkpoint_timeout to - // eventually upload WAL after activity is stopped. - #[serde(with = "humantime_serde")] - pub checkpoint_timeout: Duration, - // Target file size, when creating image and delta layers. - // This parameter determines L1 layer file size. - pub compaction_target_size: u64, - // How often to check if there's compaction work to be done. - // Duration::ZERO means automatic compaction is disabled. - #[serde(with = "humantime_serde")] - pub compaction_period: Duration, - // Level0 delta layer threshold for compaction. - pub compaction_threshold: usize, - pub compaction_algorithm: CompactionAlgorithmSettings, - // Determines how much history is retained, to allow - // branching and read replicas at an older point in time. - // The unit is #of bytes of WAL. - // Page versions older than this are garbage collected away. - pub gc_horizon: u64, - // Interval at which garbage collection is triggered. - // Duration::ZERO means automatic GC is disabled - #[serde(with = "humantime_serde")] - pub gc_period: Duration, - // Delta layer churn threshold to create L1 image layers. - pub image_creation_threshold: usize, - // Determines how much history is retained, to allow - // branching and read replicas at an older point in time. - // The unit is time. - // Page versions older than this are garbage collected away. - #[serde(with = "humantime_serde")] - pub pitr_interval: Duration, - /// Maximum amount of time to wait while opening a connection to receive wal, before erroring. - #[serde(with = "humantime_serde")] - pub walreceiver_connect_timeout: Duration, - /// Considers safekeepers stalled after no WAL updates were received longer than this threshold. - /// A stalled safekeeper will be changed to a newer one when it appears. - #[serde(with = "humantime_serde")] - pub lagging_wal_timeout: Duration, - /// Considers safekeepers lagging when their WAL is behind another safekeeper for more than this threshold. - /// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update, - /// to avoid eager reconnects. - pub max_lsn_wal_lag: NonZeroU64, - pub eviction_policy: EvictionPolicy, - pub min_resident_size_override: Option, - // See the corresponding metric's help string. - #[serde(with = "humantime_serde")] - pub evictions_low_residence_duration_metric_threshold: Duration, - - /// If non-zero, the period between uploads of a heatmap from attached tenants. This - /// may be disabled if a Tenant will not have secondary locations: only secondary - /// locations will use the heatmap uploaded by attached locations. - #[serde(with = "humantime_serde")] - pub heatmap_period: Duration, - - /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup - pub lazy_slru_download: bool, - - pub timeline_get_throttle: pageserver_api::models::ThrottleConfig, - - // How much WAL must be ingested before checking again whether a new image layer is required. - // Expresed in multiples of checkpoint distance. - pub image_layer_creation_check_threshold: u8, - - /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into - /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. - /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux - /// file is written. - pub switch_aux_file_policy: AuxFilePolicy, - - /// The length for an explicit LSN lease request. - /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. - #[serde(with = "humantime_serde")] - pub lsn_lease_length: Duration, - - /// The length for an implicit LSN lease granted as part of `get_lsn_by_timestamp` request. - /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. - #[serde(with = "humantime_serde")] - pub lsn_lease_length_for_ts: Duration, +impl Default for LocationConf { + // TODO: this should be removed once tenant loading can guarantee that we are never + // loading from a directory without a configuration. + // => tech debt since https://github.com/neondatabase/neon/issues/1555 + fn default() -> Self { + Self { + mode: LocationMode::Attached(AttachedLocationConfig { + generation: Generation::none(), + attach_mode: AttachmentMode::Single, + }), + tenant_conf: TenantConfOpt::default(), + shard: ShardIdentity::unsharded(), + } + } } /// Same as TenantConf, but this struct preserves the information about @@ -545,51 +424,6 @@ impl TenantConfOpt { } } -impl Default for TenantConf { - fn default() -> Self { - use defaults::*; - Self { - checkpoint_distance: DEFAULT_CHECKPOINT_DISTANCE, - checkpoint_timeout: humantime::parse_duration(DEFAULT_CHECKPOINT_TIMEOUT) - .expect("cannot parse default checkpoint timeout"), - compaction_target_size: DEFAULT_COMPACTION_TARGET_SIZE, - compaction_period: humantime::parse_duration(DEFAULT_COMPACTION_PERIOD) - .expect("cannot parse default compaction period"), - compaction_threshold: DEFAULT_COMPACTION_THRESHOLD, - compaction_algorithm: CompactionAlgorithmSettings { - kind: DEFAULT_COMPACTION_ALGORITHM, - }, - gc_horizon: DEFAULT_GC_HORIZON, - gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD) - .expect("cannot parse default gc period"), - image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD, - pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL) - .expect("cannot parse default PITR interval"), - walreceiver_connect_timeout: humantime::parse_duration( - DEFAULT_WALRECEIVER_CONNECT_TIMEOUT, - ) - .expect("cannot parse default walreceiver connect timeout"), - lagging_wal_timeout: humantime::parse_duration(DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT) - .expect("cannot parse default walreceiver lagging wal timeout"), - max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG) - .expect("cannot parse default max walreceiver Lsn wal lag"), - eviction_policy: EvictionPolicy::NoEviction, - min_resident_size_override: None, - evictions_low_residence_duration_metric_threshold: humantime::parse_duration( - DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD, - ) - .expect("cannot parse default evictions_low_residence_duration_metric_threshold"), - heatmap_period: Duration::ZERO, - lazy_slru_download: false, - timeline_get_throttle: crate::tenant::throttle::Config::disabled(), - image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, - switch_aux_file_policy: AuxFilePolicy::default_tenant_config(), - lsn_lease_length: LsnLease::DEFAULT_LENGTH, - lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, - } - } -} - impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt { type Error = anyhow::Error; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 44f0fc7ab1..5324e1807d 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -1,13 +1,21 @@ //! Implementation of append-only file data structure //! used to keep in-memory layers spilled on disk. +use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::RequestContext; use crate::page_cache; -use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader}; -use crate::virtual_file::{self, VirtualFile}; +use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; +use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; +use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; +use crate::virtual_file::owned_buffers_io::write::Buffer; +use crate::virtual_file::{self, owned_buffers_io, VirtualFile}; +use bytes::BytesMut; use camino::Utf8PathBuf; +use num_traits::Num; use pageserver_api::shard::TenantShardId; +use tokio_epoll_uring::{BoundedBuf, Slice}; +use tracing::error; use std::io; use std::sync::atomic::AtomicU64; @@ -16,12 +24,17 @@ use utils::id::TimelineId; pub struct EphemeralFile { _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, - - rw: page_caching::RW, + page_cache_file_id: page_cache::FileId, + bytes_written: u64, + buffered_writer: owned_buffers_io::write::BufferedWriter< + BytesMut, + size_tracking_writer::Writer, + >, + /// Gate guard is held on as long as we need to do operations in the path (delete on drop) + _gate_guard: utils::sync::gate::GateGuard, } -mod page_caching; -mod zero_padded_read_write; +const TAIL_SZ: usize = 64 * 1024; impl EphemeralFile { pub async fn create( @@ -51,75 +64,178 @@ impl EphemeralFile { ) .await?; + let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore + Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file, gate_guard), + page_cache_file_id, + bytes_written: 0, + buffered_writer: owned_buffers_io::write::BufferedWriter::new( + size_tracking_writer::Writer::new(file), + BytesMut::with_capacity(TAIL_SZ), + ), + _gate_guard: gate_guard, }) } +} +impl Drop for EphemeralFile { + fn drop(&mut self) { + // unlink the file + // we are clear to do this, because we have entered a gate + let path = &self.buffered_writer.as_inner().as_inner().path; + let res = std::fs::remove_file(path); + if let Err(e) = res { + if e.kind() != std::io::ErrorKind::NotFound { + // just never log the not found errors, we cannot do anything for them; on detach + // the tenant directory is already gone. + // + // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 + error!("could not remove ephemeral file '{path}': {e}"); + } + } + } +} + +impl EphemeralFile { pub(crate) fn len(&self) -> u64 { - self.rw.bytes_written() + self.bytes_written } pub(crate) fn page_cache_file_id(&self) -> page_cache::FileId { - self.rw.page_cache_file_id() + self.page_cache_file_id } - /// See [`self::page_caching::RW::load_to_vec`]. pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { - self.rw.load_to_vec(ctx).await - } - - pub(crate) async fn read_blk( - &self, - blknum: u32, - ctx: &RequestContext, - ) -> Result { - self.rw.read_blk(blknum, ctx).await - } - - #[cfg(test)] - // This is a test helper: outside of tests, we are always written to via a pre-serialized batch. - pub(crate) async fn write_blob( - &mut self, - srcbuf: &[u8], - ctx: &RequestContext, - ) -> Result { - let pos = self.rw.bytes_written(); - - let mut len_bytes = std::io::Cursor::new(Vec::new()); - crate::tenant::storage_layer::inmemory_layer::SerializedBatch::write_blob_length( - srcbuf.len(), - &mut len_bytes, - ); - let len_bytes = len_bytes.into_inner(); - - // Write the length field - self.rw.write_all_borrowed(&len_bytes, ctx).await?; - - // Write the payload - self.rw.write_all_borrowed(srcbuf, ctx).await?; - - Ok(pos) + let size = self.len().into_usize(); + let vec = Vec::with_capacity(size); + let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?; + assert_eq!(nread, size); + let vec = slice.into_inner(); + assert_eq!(vec.len(), nread); + assert_eq!(vec.capacity(), size, "we shouldn't be reallocating"); + Ok(vec) } /// Returns the offset at which the first byte of the input was written, for use /// in constructing indices over the written value. + /// + /// Panics if the write is short because there's no way we can recover from that. + /// TODO: make upstack handle this as an error. pub(crate) async fn write_raw( &mut self, srcbuf: &[u8], ctx: &RequestContext, - ) -> Result { - let pos = self.rw.bytes_written(); + ) -> std::io::Result { + let pos = self.bytes_written; + + let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| { + std::io::Error::new( + std::io::ErrorKind::Other, + format!( + "write would grow EphemeralFile beyond u64::MAX: len={pos} writen={srcbuf_len}", + srcbuf_len = srcbuf.len(), + ), + ) + })?; // Write the payload - self.rw.write_all_borrowed(srcbuf, ctx).await?; + let nwritten = self + .buffered_writer + .write_buffered_borrowed(srcbuf, ctx) + .await?; + assert_eq!( + nwritten, + srcbuf.len(), + "buffered writer has no short writes" + ); + + self.bytes_written = new_bytes_written; Ok(pos) } } +impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { + async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>( + &'b self, + start: u64, + dst: tokio_epoll_uring::Slice, + ctx: &'a RequestContext, + ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { + let file_size_tracking_writer = self.buffered_writer.as_inner(); + let flushed_offset = file_size_tracking_writer.bytes_written(); + + let buffer = self.buffered_writer.inspect_buffer(); + let buffered = &buffer[0..buffer.pending()]; + + let dst_cap = dst.bytes_total().into_u64(); + let end = { + // saturating_add is correct here because the max file size is u64::MAX, so, + // if start + dst.len() > u64::MAX, then we know it will be a short read + let mut end: u64 = start.saturating_add(dst_cap); + if end > self.bytes_written { + end = self.bytes_written; + } + end + }; + + // inclusive, exclusive + #[derive(Debug)] + struct Range(N, N); + impl Range { + fn len(&self) -> N { + if self.0 > self.1 { + N::zero() + } else { + self.1 - self.0 + } + } + } + let written_range = Range(start, std::cmp::min(end, flushed_offset)); + let buffered_range = Range(std::cmp::max(start, flushed_offset), end); + + let dst = if written_range.len() > 0 { + let file: &VirtualFile = file_size_tracking_writer.as_inner(); + let bounds = dst.bounds(); + let slice = file + .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) + .await?; + Slice::from_buf_bounds(Slice::into_inner(slice), bounds) + } else { + dst + }; + + let dst = if buffered_range.len() > 0 { + let offset_in_buffer = buffered_range + .0 + .checked_sub(flushed_offset) + .unwrap() + .into_usize(); + let to_copy = + &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())]; + let bounds = dst.bounds(); + let mut view = dst.slice({ + let start = written_range.len().into_usize(); + let end = start + .checked_add(buffered_range.len().into_usize()) + .unwrap(); + start..end + }); + view.as_mut_rust_slice_full_zeroed() + .copy_from_slice(to_copy); + Slice::from_buf_bounds(Slice::into_inner(view), bounds) + } else { + dst + }; + + // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs + + Ok((dst, (end - start).into_usize())) + } +} + /// Does the given filename look like an ephemeral file? pub fn is_ephemeral_file(filename: &str) -> bool { if let Some(rest) = filename.strip_prefix("ephemeral-") { @@ -129,19 +245,13 @@ pub fn is_ephemeral_file(filename: &str) -> bool { } } -impl BlockReader for EphemeralFile { - fn block_cursor(&self) -> super::block_io::BlockCursor<'_> { - BlockCursor::new(super::block_io::BlockReaderRef::EphemeralFile(self)) - } -} - #[cfg(test)] mod tests { + use rand::Rng; + use super::*; use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - use crate::tenant::block_io::BlockReaderRef; - use rand::{thread_rng, RngCore}; use std::fs; use std::str::FromStr; @@ -172,69 +282,6 @@ mod tests { Ok((conf, tenant_shard_id, timeline_id, ctx)) } - #[tokio::test] - async fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenant_id, timeline_id, ctx) = harness("ephemeral_blobs")?; - - let gate = utils::sync::gate::Gate::default(); - - let entered = gate.enter().unwrap(); - - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, entered, &ctx).await?; - - let pos_foo = file.write_blob(b"foo", &ctx).await?; - assert_eq!( - b"foo", - file.block_cursor() - .read_blob(pos_foo, &ctx) - .await? - .as_slice() - ); - let pos_bar = file.write_blob(b"bar", &ctx).await?; - assert_eq!( - b"foo", - file.block_cursor() - .read_blob(pos_foo, &ctx) - .await? - .as_slice() - ); - assert_eq!( - b"bar", - file.block_cursor() - .read_blob(pos_bar, &ctx) - .await? - .as_slice() - ); - - let mut blobs = Vec::new(); - for i in 0..10000 { - let data = Vec::from(format!("blob{}", i).as_bytes()); - let pos = file.write_blob(&data, &ctx).await?; - blobs.push((pos, data)); - } - // also test with a large blobs - for i in 0..100 { - let data = format!("blob{}", i).as_bytes().repeat(100); - let pos = file.write_blob(&data, &ctx).await?; - blobs.push((pos, data)); - } - - let cursor = BlockCursor::new(BlockReaderRef::EphemeralFile(&file)); - for (pos, expected) in blobs { - let actual = cursor.read_blob(pos, &ctx).await?; - assert_eq!(actual, expected); - } - - // Test a large blob that spans multiple pages - let mut large_data = vec![0; 20000]; - thread_rng().fill_bytes(&mut large_data); - let pos_large = file.write_blob(&large_data, &ctx).await?; - let result = file.block_cursor().read_blob(pos_large, &ctx).await?; - assert_eq!(result, large_data); - - Ok(()) - } - #[tokio::test] async fn ephemeral_file_holds_gate_open() { const FOREVER: std::time::Duration = std::time::Duration::from_secs(5); @@ -268,4 +315,151 @@ mod tests { .expect("closing completes right away") .expect("closing does not panic"); } + + #[tokio::test] + async fn test_ephemeral_file_basics() { + let (conf, tenant_id, timeline_id, ctx) = harness("test_ephemeral_file_basics").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let mut file = + EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let cap = file.buffered_writer.inspect_buffer().capacity(); + + let write_nbytes = cap + cap / 2; + + let content: Vec = rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(write_nbytes) + .collect(); + + let mut value_offsets = Vec::new(); + for i in 0..write_nbytes { + let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap(); + value_offsets.push(off); + } + + assert!(file.len() as usize == write_nbytes); + for i in 0..write_nbytes { + assert_eq!(value_offsets[i], i.into_u64()); + let buf = Vec::with_capacity(1); + let (buf_slice, nread) = file + .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx) + .await + .unwrap(); + let buf = buf_slice.into_inner(); + assert_eq!(nread, 1); + assert_eq!(&buf, &content[i..i + 1]); + } + + let file_contents = + std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap(); + assert_eq!(file_contents, &content[0..cap]); + + let buffer_contents = file.buffered_writer.inspect_buffer(); + assert_eq!(buffer_contents, &content[cap..write_nbytes]); + } + + #[tokio::test] + async fn test_flushes_do_happen() { + let (conf, tenant_id, timeline_id, ctx) = harness("test_flushes_do_happen").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let mut file = + EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let cap = file.buffered_writer.inspect_buffer().capacity(); + + let content: Vec = rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(cap + cap / 2) + .collect(); + + file.write_raw(&content, &ctx).await.unwrap(); + + // assert the state is as this test expects it to be + assert_eq!( + &file.load_to_vec(&ctx).await.unwrap(), + &content[0..cap + cap / 2] + ); + let md = file + .buffered_writer + .as_inner() + .as_inner() + .path + .metadata() + .unwrap(); + assert_eq!( + md.len(), + cap.into_u64(), + "buffered writer does one write if we write 1.5x buffer capacity" + ); + assert_eq!( + &file.buffered_writer.inspect_buffer()[0..cap / 2], + &content[cap..cap + cap / 2] + ); + } + + #[tokio::test] + async fn test_read_split_across_file_and_buffer() { + // This test exercises the logic on the read path that splits the logical read + // into a read from the flushed part (= the file) and a copy from the buffered writer's buffer. + // + // This test build on the assertions in test_flushes_do_happen + + let (conf, tenant_id, timeline_id, ctx) = + harness("test_read_split_across_file_and_buffer").unwrap(); + + let gate = utils::sync::gate::Gate::default(); + + let mut file = + EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + .await + .unwrap(); + + let cap = file.buffered_writer.inspect_buffer().capacity(); + + let content: Vec = rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(cap + cap / 2) + .collect(); + + file.write_raw(&content, &ctx).await.unwrap(); + + let test_read = |start: usize, len: usize| { + let file = &file; + let ctx = &ctx; + let content = &content; + async move { + let (buf, nread) = file + .read_exact_at_eof_ok( + start.into_u64(), + Vec::with_capacity(len).slice_full(), + ctx, + ) + .await + .unwrap(); + assert_eq!(nread, len); + assert_eq!(&buf.into_inner(), &content[start..(start + len)]); + } + }; + + // completely within the file range + assert!(20 < cap, "test assumption"); + test_read(10, 10).await; + // border onto edge of file + test_read(cap - 10, 10).await; + // read across file and buffer + test_read(cap - 10, 20).await; + // stay from start of buffer + test_read(cap, 10).await; + // completely within buffer + test_read(cap + 10, 10).await; + } } diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs deleted file mode 100644 index 48926354f1..0000000000 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ /dev/null @@ -1,153 +0,0 @@ -//! Wrapper around [`super::zero_padded_read_write::RW`] that uses the -//! [`crate::page_cache`] to serve reads that need to go to the underlying [`VirtualFile`]. -//! -//! Subject to removal in - -use crate::context::RequestContext; -use crate::page_cache::{self, PAGE_SZ}; -use crate::tenant::block_io::BlockLease; -use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; -use crate::virtual_file::VirtualFile; - -use std::io::{self}; -use tokio_epoll_uring::BoundedBuf; -use tracing::*; - -use super::zero_padded_read_write; - -/// See module-level comment. -pub struct RW { - page_cache_file_id: page_cache::FileId, - rw: super::zero_padded_read_write::RW>, - /// Gate guard is held on as long as we need to do operations in the path (delete on drop). - _gate_guard: utils::sync::gate::GateGuard, -} - -impl RW { - pub fn new(file: VirtualFile, _gate_guard: utils::sync::gate::GateGuard) -> Self { - let page_cache_file_id = page_cache::next_file_id(); - Self { - page_cache_file_id, - rw: super::zero_padded_read_write::RW::new(size_tracking_writer::Writer::new(file)), - _gate_guard, - } - } - - pub fn page_cache_file_id(&self) -> page_cache::FileId { - self.page_cache_file_id - } - - pub(crate) async fn write_all_borrowed( - &mut self, - srcbuf: &[u8], - ctx: &RequestContext, - ) -> Result { - // It doesn't make sense to proactively fill the page cache on the Pageserver write path - // because Compute is unlikely to access recently written data. - self.rw.write_all_borrowed(srcbuf, ctx).await - } - - pub(crate) fn bytes_written(&self) -> u64 { - self.rw.bytes_written() - } - - /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer. - /// - /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer. - /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`]. - pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { - // round up to the next PAGE_SZ multiple, required by blob_io - let size = { - let s = usize::try_from(self.bytes_written()).unwrap(); - if s % PAGE_SZ == 0 { - s - } else { - s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap() - } - }; - let vec = Vec::with_capacity(size); - - // read from disk what we've already flushed - let file_size_tracking_writer = self.rw.as_writer(); - let flushed_range = 0..usize::try_from(file_size_tracking_writer.bytes_written()).unwrap(); - let mut vec = file_size_tracking_writer - .as_inner() - .read_exact_at( - vec.slice(0..(flushed_range.end - flushed_range.start)), - u64::try_from(flushed_range.start).unwrap(), - ctx, - ) - .await? - .into_inner(); - - // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk - let buffered = self.rw.get_tail_zero_padded(); - vec.extend_from_slice(buffered); - assert_eq!(vec.len(), size); - assert_eq!(vec.len() % PAGE_SZ, 0); - Ok(vec) - } - - pub(crate) async fn read_blk( - &self, - blknum: u32, - ctx: &RequestContext, - ) -> Result { - match self.rw.read_blk(blknum).await? { - zero_padded_read_write::ReadResult::NeedsReadFromWriter { writer } => { - let cache = page_cache::get(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, ctx) - .await - .map_err(|e| { - std::io::Error::new( - std::io::ErrorKind::Other, - // order path before error because error is anyhow::Error => might have many contexts - format!( - "ephemeral file: read immutable page #{}: {}: {:#}", - blknum, - self.rw.as_writer().as_inner().path, - e, - ), - ) - })? { - page_cache::ReadBufResult::Found(guard) => { - return Ok(BlockLease::PageReadGuard(guard)) - } - page_cache::ReadBufResult::NotFound(write_guard) => { - let write_guard = writer - .as_inner() - .read_exact_at_page(write_guard, blknum as u64 * PAGE_SZ as u64, ctx) - .await?; - let read_guard = write_guard.mark_valid(); - return Ok(BlockLease::PageReadGuard(read_guard)); - } - } - } - zero_padded_read_write::ReadResult::ServedFromZeroPaddedMutableTail { buffer } => { - Ok(BlockLease::EphemeralFileMutableTail(buffer)) - } - } - } -} - -impl Drop for RW { - fn drop(&mut self) { - // There might still be pages in the [`crate::page_cache`] for this file. - // We leave them there, [`crate::page_cache::PageCache::find_victim`] will evict them when needed. - - // unlink the file - // we are clear to do this, because we have entered a gate - let path = &self.rw.as_writer().as_inner().path; - let res = std::fs::remove_file(path); - if let Err(e) = res { - if e.kind() != std::io::ErrorKind::NotFound { - // just never log the not found errors, we cannot do anything for them; on detach - // the tenant directory is already gone. - // - // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 - error!("could not remove ephemeral file '{path}': {e}"); - } - } - } -} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs deleted file mode 100644 index fe310acab8..0000000000 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ /dev/null @@ -1,145 +0,0 @@ -//! The heart of how [`super::EphemeralFile`] does its reads and writes. -//! -//! # Writes -//! -//! [`super::EphemeralFile`] writes small, borrowed buffers using [`RW::write_all_borrowed`]. -//! The [`RW`] batches these into [`TAIL_SZ`] bigger writes, using [`owned_buffers_io::write::BufferedWriter`]. -//! -//! # Reads -//! -//! [`super::EphemeralFile`] always reads full [`PAGE_SZ`]ed blocks using [`RW::read_blk`]. -//! -//! The [`RW`] serves these reads either from the buffered writer's in-memory buffer -//! or redirects the caller to read from the underlying [`OwnedAsyncWriter`] -//! if the read is for the prefix that has already been flushed. -//! -//! # Current Usage -//! -//! The current user of this module is [`super::page_caching::RW`]. - -mod zero_padded; - -use crate::{ - context::RequestContext, - page_cache::PAGE_SZ, - virtual_file::owned_buffers_io::{ - self, - write::{Buffer, OwnedAsyncWriter}, - }, -}; - -const TAIL_SZ: usize = 64 * 1024; - -/// See module-level comment. -pub struct RW { - buffered_writer: owned_buffers_io::write::BufferedWriter< - zero_padded::Buffer, - owned_buffers_io::util::size_tracking_writer::Writer, - >, -} - -pub enum ReadResult<'a, W> { - NeedsReadFromWriter { writer: &'a W }, - ServedFromZeroPaddedMutableTail { buffer: &'a [u8; PAGE_SZ] }, -} - -impl RW -where - W: OwnedAsyncWriter, -{ - pub fn new(writer: W) -> Self { - let bytes_flushed_tracker = - owned_buffers_io::util::size_tracking_writer::Writer::new(writer); - let buffered_writer = owned_buffers_io::write::BufferedWriter::new( - bytes_flushed_tracker, - zero_padded::Buffer::default(), - ); - Self { buffered_writer } - } - - pub(crate) fn as_writer(&self) -> &W { - self.buffered_writer.as_inner().as_inner() - } - - pub async fn write_all_borrowed( - &mut self, - buf: &[u8], - ctx: &RequestContext, - ) -> std::io::Result { - self.buffered_writer.write_buffered_borrowed(buf, ctx).await - } - - pub fn bytes_written(&self) -> u64 { - let flushed_offset = self.buffered_writer.as_inner().bytes_written(); - let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); - flushed_offset + u64::try_from(buffer.pending()).unwrap() - } - - /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. - pub fn get_tail_zero_padded(&self) -> &[u8] { - let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); - let buffer_written_up_to = buffer.pending(); - // pad to next page boundary - let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 { - buffer_written_up_to - } else { - buffer_written_up_to - .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ)) - .unwrap() - }; - &buffer.as_zero_padded_slice()[0..read_up_to] - } - - pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { - let flushed_offset = self.buffered_writer.as_inner().bytes_written(); - let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); - let buffered_offset = flushed_offset + u64::try_from(buffer.pending()).unwrap(); - let read_offset = (blknum as u64) * (PAGE_SZ as u64); - - // The trailing page ("block") might only be partially filled, - // yet the blob_io code relies on us to return a full PAGE_SZed slice anyway. - // Moreover, it has to be zero-padded, because when we still had - // a write-back page cache, it provided pre-zeroed pages, and blob_io came to rely on it. - // DeltaLayer probably has the same issue, not sure why it needs no special treatment. - // => check here that the read doesn't go beyond this potentially trailing - // => the zero-padding is done in the `else` branch below - let blocks_written = if buffered_offset % (PAGE_SZ as u64) == 0 { - buffered_offset / (PAGE_SZ as u64) - } else { - (buffered_offset / (PAGE_SZ as u64)) + 1 - }; - if (blknum as u64) >= blocks_written { - return Err(std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!("read past end of ephemeral_file: read=0x{read_offset:x} buffered=0x{buffered_offset:x} flushed=0x{flushed_offset}"))); - } - - // assertions for the `if-else` below - assert_eq!( - flushed_offset % (TAIL_SZ as u64), 0, - "we only use write_buffered_borrowed to write to the buffered writer, so it's guaranteed that flushes happen buffer.cap()-sized chunks" - ); - assert_eq!( - flushed_offset % (PAGE_SZ as u64), - 0, - "the logic below can't handle if the page is spread across the flushed part and the buffer" - ); - - if read_offset < flushed_offset { - assert!(read_offset + (PAGE_SZ as u64) <= flushed_offset); - Ok(ReadResult::NeedsReadFromWriter { - writer: self.as_writer(), - }) - } else { - let read_offset_in_buffer = read_offset - .checked_sub(flushed_offset) - .expect("would have taken `if` branch instead of this one"); - let read_offset_in_buffer = usize::try_from(read_offset_in_buffer).unwrap(); - let zero_padded_slice = buffer.as_zero_padded_slice(); - let page = &zero_padded_slice[read_offset_in_buffer..(read_offset_in_buffer + PAGE_SZ)]; - Ok(ReadResult::ServedFromZeroPaddedMutableTail { - buffer: page - .try_into() - .expect("the slice above got it as page-size slice"), - }) - } - } -} diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs deleted file mode 100644 index 2dc0277638..0000000000 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write/zero_padded.rs +++ /dev/null @@ -1,110 +0,0 @@ -//! A [`crate::virtual_file::owned_buffers_io::write::Buffer`] whose -//! unwritten range is guaranteed to be zero-initialized. -//! This is used by [`crate::tenant::ephemeral_file::zero_padded_read_write::RW::read_blk`] -//! to serve page-sized reads of the trailing page when the trailing page has only been partially filled. - -use std::mem::MaybeUninit; - -use crate::virtual_file::owned_buffers_io::io_buf_ext::FullSlice; - -/// See module-level comment. -pub struct Buffer { - allocation: Box<[u8; N]>, - written: usize, -} - -impl Default for Buffer { - fn default() -> Self { - Self { - allocation: Box::new( - // SAFETY: zeroed memory is a valid [u8; N] - unsafe { MaybeUninit::zeroed().assume_init() }, - ), - written: 0, - } - } -} - -impl Buffer { - #[inline(always)] - fn invariants(&self) { - // don't check by default, unoptimized is too expensive even for debug mode - if false { - debug_assert!(self.written <= N, "{}", self.written); - debug_assert!(self.allocation[self.written..N].iter().all(|v| *v == 0)); - } - } - - pub fn as_zero_padded_slice(&self) -> &[u8; N] { - &self.allocation - } -} - -impl crate::virtual_file::owned_buffers_io::write::Buffer for Buffer { - type IoBuf = Self; - - fn cap(&self) -> usize { - self.allocation.len() - } - - fn extend_from_slice(&mut self, other: &[u8]) { - self.invariants(); - let remaining = self.allocation.len() - self.written; - if other.len() > remaining { - panic!("calling extend_from_slice() with insufficient remaining capacity"); - } - self.allocation[self.written..(self.written + other.len())].copy_from_slice(other); - self.written += other.len(); - self.invariants(); - } - - fn pending(&self) -> usize { - self.written - } - - fn flush(self) -> FullSlice { - self.invariants(); - let written = self.written; - FullSlice::must_new(tokio_epoll_uring::BoundedBuf::slice(self, 0..written)) - } - - fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { - let Self { - mut allocation, - written, - } = iobuf; - allocation[0..written].fill(0); - let new = Self { - allocation, - written: 0, - }; - new.invariants(); - new - } -} - -/// We have this trait impl so that the `flush` method in the `Buffer` impl above can produce a -/// [`tokio_epoll_uring::BoundedBuf::slice`] of the [`Self::written`] range of the data. -/// -/// Remember that bytes_init is generally _not_ a tracker of the amount -/// of valid data in the io buffer; we use `Slice` for that. -/// The `IoBuf` is _only_ for keeping track of uninitialized memory, a bit like MaybeUninit. -/// -/// SAFETY: -/// -/// The [`Self::allocation`] is stable becauses boxes are stable. -/// The memory is zero-initialized, so, bytes_init is always N. -unsafe impl tokio_epoll_uring::IoBuf for Buffer { - fn stable_ptr(&self) -> *const u8 { - self.allocation.as_ptr() - } - - fn bytes_init(&self) -> usize { - // Yes, N, not self.written; Read the full comment of this impl block! - N - } - - fn bytes_total(&self) -> usize { - N - } -} diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index f4a2957972..6a2cd94232 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -39,8 +39,8 @@ use crate::tenant::disk_btree::{ use crate::tenant::storage_layer::layer::S3_UPLOAD_LIMIT; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, - VectoredReadPlanner, + BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, + VectoredReadCoalesceMode, VectoredReadPlanner, }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; @@ -52,6 +52,7 @@ use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; @@ -65,7 +66,7 @@ use std::os::unix::fs::FileExt; use std::str::FromStr; use std::sync::Arc; use tokio::sync::OnceCell; -use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::IoBuf; use tracing::*; use utils::{ @@ -225,9 +226,7 @@ pub struct DeltaLayerInner { file: VirtualFile, file_id: FileId, - #[allow(dead_code)] layer_key_range: Range, - #[allow(dead_code)] layer_lsn_range: Range, max_vectored_read_bytes: Option, @@ -471,7 +470,7 @@ impl DeltaLayerWriterInner { ctx: &RequestContext, ) -> (FullSlice, anyhow::Result<()>) where - Buf: IoBufMut + Send, + Buf: IoBuf + Send, { assert!( self.lsn_range.start <= lsn, @@ -678,7 +677,7 @@ impl DeltaLayerWriter { ctx: &RequestContext, ) -> (FullSlice, anyhow::Result<()>) where - Buf: IoBufMut + Send, + Buf: IoBuf + Send, { self.inner .as_mut() @@ -882,44 +881,6 @@ impl DeltaLayerInner { Ok(()) } - /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. - pub(super) async fn load_key_values( - &self, - ctx: &RequestContext, - ) -> anyhow::Result> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - self.index_start_blk, - self.index_root_blk, - block_reader, - ); - let mut result = Vec::new(); - let mut stream = - Box::pin(self.stream_index_forwards(index_reader, &[0; DELTA_KEY_SIZE], ctx)); - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let cursor = block_reader.block_cursor(); - let mut buf = Vec::new(); - while let Some(item) = stream.next().await { - let (key, lsn, pos) = item?; - // TODO: dedup code with get_reconstruct_value - // TODO: ctx handling and sharding - cursor - .read_blob_into_buf(pos.pos(), &mut buf, ctx) - .await - .with_context(|| { - format!("Failed to read blob from virtual file {}", self.file.path) - })?; - let val = Value::des(&buf).with_context(|| { - format!( - "Failed to deserialize file blob from virtual file {}", - self.file.path - ) - })?; - result.push((key, lsn, val)); - } - Ok(result) - } - async fn plan_reads( keyspace: &KeySpace, lsn_range: Range, @@ -1205,6 +1166,7 @@ impl DeltaLayerInner { let mut prev: Option<(Key, Lsn, BlobRef)> = None; let mut read_builder: Option = None; + let read_mode = VectoredReadCoalesceMode::get(); let max_read_size = self .max_vectored_read_bytes @@ -1253,6 +1215,7 @@ impl DeltaLayerInner { offsets.end.pos(), meta, max_read_size, + read_mode, )) } } else { @@ -2295,7 +2258,7 @@ pub(crate) mod test { // every key should be a batch b/c the value is larger than max_read_size assert_eq!(iter.key_values_batch.len(), 1); } else { - assert_eq!(iter.key_values_batch.len(), batch_size); + assert!(iter.key_values_batch.len() <= batch_size); } if num_items >= N { break; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 3cb2b1c83a..77ce1ae670 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -28,14 +28,13 @@ use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; -use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader}; +use crate::tenant::block_io::{BlockBuf, FileBlockReader}; use crate::tenant::disk_btree::{ DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection, }; use crate::tenant::timeline::GetVectoredError; use crate::tenant::vectored_blob_io::{ - BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, - VectoredReadPlanner, + BlobFlag, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead, VectoredReadPlanner, }; use crate::tenant::{PageReconstructError, Timeline}; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; @@ -46,6 +45,7 @@ use bytes::{Bytes, BytesMut}; use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; +use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; @@ -453,33 +453,6 @@ impl ImageLayerInner { Ok(()) } - /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. - pub(super) async fn load_key_values( - &self, - ctx: &RequestContext, - ) -> anyhow::Result> { - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let tree_reader = - DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); - let mut result = Vec::new(); - let mut stream = Box::pin(tree_reader.into_stream(&[0; KEY_SIZE], ctx)); - let block_reader = FileBlockReader::new(&self.file, self.file_id); - let cursor = block_reader.block_cursor(); - while let Some(item) = stream.next().await { - // TODO: dedup code with get_reconstruct_value - let (raw_key, offset) = item?; - let key = Key::from_slice(&raw_key[..KEY_SIZE]); - // TODO: ctx handling and sharding - let blob = cursor - .read_blob(offset, ctx) - .await - .with_context(|| format!("failed to read value from offset {}", offset))?; - let value = Bytes::from(blob); - result.push((key, self.lsn, Value::Image(value))); - } - Ok(result) - } - /// Traverse the layer's index to build read operations on the overlap of the input keyspace /// and the keys in this layer. /// @@ -711,7 +684,7 @@ struct ImageLayerWriterInner { blob_writer: BlobWriter, tree: DiskBtreeBuilder, - #[cfg_attr(not(feature = "testing"), allow(dead_code))] + #[cfg(feature = "testing")] last_written_key: Key, } @@ -770,6 +743,7 @@ impl ImageLayerWriterInner { uncompressed_bytes_eligible: 0, uncompressed_bytes_chosen: 0, num_keys: 0, + #[cfg(feature = "testing")] last_written_key: Key::MIN, }; @@ -1381,7 +1355,7 @@ mod test { // every key should be a batch b/c the value is larger than max_read_size assert_eq!(iter.key_values_batch.len(), 1); } else { - assert_eq!(iter.key_values_batch.len(), batch_size); + assert!(iter.key_values_batch.len() <= batch_size); } if num_items >= N { break; diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index a71b4dd83b..e487bee1f2 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -4,23 +4,23 @@ //! held in an ephemeral file, not in memory. The metadata for each page version, i.e. //! its position in the file, is kept in memory, though. //! +use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; -use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::timeline::GetVectoredError; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, Context, Result}; +use bytes::Bytes; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashMap}; use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; @@ -39,6 +39,8 @@ use super::{ DeltaLayerWriter, PersistentLayerDesc, ValueReconstructSituation, ValuesReconstructState, }; +pub(crate) mod vectored_dio_read; + #[derive(Debug, PartialEq, Eq, Clone, Copy, Hash)] pub(crate) struct InMemoryLayerFileId(page_cache::FileId); @@ -78,9 +80,9 @@ impl std::fmt::Debug for InMemoryLayer { pub struct InMemoryLayerInner { /// All versions of all pages in the layer are kept here. Indexed - /// by block number and LSN. The value is an offset into the + /// by block number and LSN. The [`IndexEntry`] is an offset into the /// ephemeral file where the page version is stored. - index: BTreeMap>, + index: BTreeMap>, /// The values are stored in a serialized format in this file. /// Each serialized Value is preceded by a 'u32' length field. @@ -90,6 +92,154 @@ pub struct InMemoryLayerInner { resource_units: GlobalResourceUnits, } +/// Support the same max blob length as blob_io, because ultimately +/// all the InMemoryLayer contents end up being written into a delta layer, +/// using the [`crate::tenant::blob_io`]. +const MAX_SUPPORTED_BLOB_LEN: usize = crate::tenant::blob_io::MAX_SUPPORTED_BLOB_LEN; +const MAX_SUPPORTED_BLOB_LEN_BITS: usize = { + let trailing_ones = MAX_SUPPORTED_BLOB_LEN.trailing_ones() as usize; + let leading_zeroes = MAX_SUPPORTED_BLOB_LEN.leading_zeros() as usize; + assert!(trailing_ones + leading_zeroes == std::mem::size_of::() * 8); + trailing_ones +}; + +/// See [`InMemoryLayerInner::index`]. +/// +/// For memory efficiency, the data is packed into a u64. +/// +/// Layout: +/// - 1 bit: `will_init` +/// - [`MAX_SUPPORTED_BLOB_LEN_BITS`]: `len` +/// - [`MAX_SUPPORTED_POS_BITS`]: `pos` +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct IndexEntry(u64); + +impl IndexEntry { + /// See [`Self::MAX_SUPPORTED_POS`]. + const MAX_SUPPORTED_POS_BITS: usize = { + let remainder = 64 - 1 - MAX_SUPPORTED_BLOB_LEN_BITS; + if remainder < 32 { + panic!("pos can be u32 as per type system, support that"); + } + remainder + }; + /// The maximum supported blob offset that can be represented by [`Self`]. + /// See also [`Self::validate_checkpoint_distance`]. + const MAX_SUPPORTED_POS: usize = (1 << Self::MAX_SUPPORTED_POS_BITS) - 1; + + // Layout + const WILL_INIT_RANGE: Range = 0..1; + const LEN_RANGE: Range = + Self::WILL_INIT_RANGE.end..Self::WILL_INIT_RANGE.end + MAX_SUPPORTED_BLOB_LEN_BITS; + const POS_RANGE: Range = + Self::LEN_RANGE.end..Self::LEN_RANGE.end + Self::MAX_SUPPORTED_POS_BITS; + const _ASSERT: () = { + if Self::POS_RANGE.end != 64 { + panic!("we don't want undefined bits for our own sanity") + } + }; + + /// Fails if and only if the offset or length encoded in `arg` is too large to be represented by [`Self`]. + /// + /// The only reason why that can happen in the system is if the [`InMemoryLayer`] grows too long. + /// The [`InMemoryLayer`] size is determined by the checkpoint distance, enforced by [`crate::tenant::Timeline::should_roll`]. + /// + /// Thus, to avoid failure of this function, whenever we start up and/or change checkpoint distance, + /// call [`Self::validate_checkpoint_distance`] with the new checkpoint distance value. + /// + /// TODO: this check should happen ideally at config parsing time (and in the request handler when a change to checkpoint distance is requested) + /// When cleaning this up, also look into the s3 max file size check that is performed in delta layer writer. + #[inline(always)] + fn new(arg: IndexEntryNewArgs) -> anyhow::Result { + let IndexEntryNewArgs { + base_offset, + batch_offset, + len, + will_init, + } = arg; + + let pos = base_offset + .checked_add(batch_offset) + .ok_or_else(|| anyhow::anyhow!("base_offset + batch_offset overflows u64: base_offset={base_offset} batch_offset={batch_offset}"))?; + + if pos.into_usize() > Self::MAX_SUPPORTED_POS { + anyhow::bail!( + "base_offset+batch_offset exceeds the maximum supported value: base_offset={base_offset} batch_offset={batch_offset} (+)={pos} max={max}", + max = Self::MAX_SUPPORTED_POS + ); + } + + if len > MAX_SUPPORTED_BLOB_LEN { + anyhow::bail!( + "len exceeds the maximum supported length: len={len} max={MAX_SUPPORTED_BLOB_LEN}", + ); + } + + let mut data: u64 = 0; + use bit_field::BitField; + data.set_bits(Self::WILL_INIT_RANGE, if will_init { 1 } else { 0 }); + data.set_bits(Self::LEN_RANGE, len.into_u64()); + data.set_bits(Self::POS_RANGE, pos); + + Ok(Self(data)) + } + + #[inline(always)] + fn unpack(&self) -> IndexEntryUnpacked { + use bit_field::BitField; + IndexEntryUnpacked { + will_init: self.0.get_bits(Self::WILL_INIT_RANGE) != 0, + len: self.0.get_bits(Self::LEN_RANGE), + pos: self.0.get_bits(Self::POS_RANGE), + } + } + + /// See [`Self::new`]. + pub(crate) const fn validate_checkpoint_distance( + checkpoint_distance: u64, + ) -> Result<(), &'static str> { + if checkpoint_distance > Self::MAX_SUPPORTED_POS as u64 { + return Err("exceeds the maximum supported value"); + } + let res = u64_to_usize(checkpoint_distance).checked_add(MAX_SUPPORTED_BLOB_LEN); + if res.is_none() { + return Err( + "checkpoint distance + max supported blob len overflows in-memory addition", + ); + } + + // NB: it is ok for the result of the addition to be larger than MAX_SUPPORTED_POS + + Ok(()) + } + + const _ASSERT_DEFAULT_CHECKPOINT_DISTANCE_IS_VALID: () = { + let res = Self::validate_checkpoint_distance( + pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE, + ); + if res.is_err() { + panic!("default checkpoint distance is valid") + } + }; +} + +/// Args to [`IndexEntry::new`]. +#[derive(Clone, Copy)] +struct IndexEntryNewArgs { + base_offset: u64, + batch_offset: u64, + len: usize, + will_init: bool, +} + +/// Unpacked representation of the bitfielded [`IndexEntry`]. +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +struct IndexEntryUnpacked { + will_init: bool, + len: u64, + pos: u64, +} + impl std::fmt::Debug for InMemoryLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("InMemoryLayerInner").finish() @@ -276,7 +426,12 @@ impl InMemoryLayer { .build(); let inner = self.inner.read().await; - let reader = inner.file.block_cursor(); + + struct ValueRead { + entry_lsn: Lsn, + read: vectored_dio_read::LogicalRead>, + } + let mut reads: HashMap> = HashMap::new(); for range in keyspace.ranges.iter() { for (key, vec_map) in inner @@ -291,24 +446,62 @@ impl InMemoryLayer { let slice = vec_map.slice_range(lsn_range); - for (entry_lsn, pos) in slice.iter().rev() { - // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 - let buf = reader.read_blob(*pos, &ctx).await; - if let Err(e) = buf { - reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); + for (entry_lsn, index_entry) in slice.iter().rev() { + let IndexEntryUnpacked { + pos, + len, + will_init, + } = index_entry.unpack(); + reads.entry(key).or_default().push(ValueRead { + entry_lsn: *entry_lsn, + read: vectored_dio_read::LogicalRead::new( + pos, + Vec::with_capacity(len as usize), + ), + }); + if will_init { break; } + } + } + } - let value = Value::des(&buf.unwrap()); - if let Err(e) = value { + // Execute the reads. + + let f = vectored_dio_read::execute( + &inner.file, + reads + .iter() + .flat_map(|(_, value_reads)| value_reads.iter().map(|v| &v.read)), + &ctx, + ); + send_future::SendFuture::send(f) // https://github.com/rust-lang/rust/issues/96865 + .await; + + // Process results into the reconstruct state + 'next_key: for (key, value_reads) in reads { + for ValueRead { entry_lsn, read } in value_reads { + match read.into_result().expect("we run execute() above") { + Err(e) => { reconstruct_state.on_key_error(key, PageReconstructError::from(anyhow!(e))); - break; + continue 'next_key; } + Ok(value_buf) => { + let value = Value::des(&value_buf); + if let Err(e) = value { + reconstruct_state + .on_key_error(key, PageReconstructError::from(anyhow!(e))); + continue 'next_key; + } - let key_situation = - reconstruct_state.update_key(&key, *entry_lsn, value.unwrap()); - if key_situation == ValueReconstructSituation::Complete { - break; + let key_situation = + reconstruct_state.update_key(&key, entry_lsn, value.unwrap()); + if key_situation == ValueReconstructSituation::Complete { + // TODO: metric to see if we fetched more values than necessary + continue 'next_key; + } + + // process the next value in the next iteration of the loop } } } @@ -324,8 +517,9 @@ impl InMemoryLayer { struct SerializedBatchOffset { key: CompactKey, lsn: Lsn, - /// offset in bytes from the start of the batch's buffer to the Value's serialized size header. - offset: u64, + // TODO: separate type when we start serde-serializing this value, to avoid coupling + // in-memory representation to serialization format. + index_entry: IndexEntry, } pub struct SerializedBatch { @@ -340,30 +534,10 @@ pub struct SerializedBatch { } impl SerializedBatch { - /// Write a blob length in the internal format of the EphemeralFile - pub(crate) fn write_blob_length(len: usize, cursor: &mut std::io::Cursor>) { - use std::io::Write; - - if len < 0x80 { - // short one-byte length header - let len_buf = [len as u8]; - - cursor - .write_all(&len_buf) - .expect("Writing to Vec is infallible"); - } else { - let mut len_buf = u32::to_be_bytes(len as u32); - len_buf[0] |= 0x80; - cursor - .write_all(&len_buf) - .expect("Writing to Vec is infallible"); - } - } - - pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self { + pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result { // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`] - let buffer_size = batch.iter().map(|i| i.2).sum::() + 4 * batch.len(); + let buffer_size = batch.iter().map(|i| i.2).sum::(); let mut cursor = std::io::Cursor::new(Vec::::with_capacity(buffer_size)); let mut offsets: Vec = Vec::with_capacity(batch.len()); @@ -371,14 +545,19 @@ impl SerializedBatch { for (key, lsn, val_ser_size, val) in batch { let relative_off = cursor.position(); - Self::write_blob_length(val_ser_size, &mut cursor); val.ser_into(&mut cursor) .expect("Writing into in-memory buffer is infallible"); offsets.push(SerializedBatchOffset { key, lsn, - offset: relative_off, + index_entry: IndexEntry::new(IndexEntryNewArgs { + base_offset: 0, + batch_offset: relative_off, + len: val_ser_size, + will_init: val.will_init(), + }) + .context("higher-level code ensures that values are within supported ranges")?, }); max_lsn = std::cmp::max(max_lsn, lsn); } @@ -388,11 +567,11 @@ impl SerializedBatch { // Assert that we didn't do any extra allocations while building buffer. debug_assert!(buffer.len() <= buffer_size); - Self { + Ok(Self { raw: buffer, offsets, max_lsn, - } + }) } } @@ -456,44 +635,74 @@ impl InMemoryLayer { }) } - // Write path. + /// Write path. + /// + /// Errors are not retryable, the [`InMemoryLayer`] must be discarded, and not be read from. + /// The reason why it's not retryable is that the [`EphemeralFile`] writes are not retryable. + /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors. pub async fn put_batch( &self, serialized_batch: SerializedBatch, ctx: &RequestContext, - ) -> Result<()> { + ) -> anyhow::Result<()> { let mut inner = self.inner.write().await; self.assert_writable(); - let base_off = { - inner - .file - .write_raw( - &serialized_batch.raw, - &RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(), - ) - .await? - }; + let base_offset = inner.file.len(); + let SerializedBatch { + raw, + mut offsets, + max_lsn: _, + } = serialized_batch; + + // Add the base_offset to the batch's index entries which are relative to the batch start. + for offset in &mut offsets { + let IndexEntryUnpacked { + will_init, + len, + pos, + } = offset.index_entry.unpack(); + offset.index_entry = IndexEntry::new(IndexEntryNewArgs { + base_offset, + batch_offset: pos, + len: len.into_usize(), + will_init, + })?; + } + + // Write the batch to the file + inner.file.write_raw(&raw, ctx).await?; + let new_size = inner.file.len(); + let expected_new_len = base_offset + .checked_add(raw.len().into_u64()) + // write_raw would error if we were to overflow u64. + // also IndexEntry and higher levels in + //the code don't allow the file to grow that large + .unwrap(); + assert_eq!(new_size, expected_new_len); + + // Update the index with the new entries for SerializedBatchOffset { key, lsn, - offset: relative_off, - } in serialized_batch.offsets + index_entry, + } in offsets { - let off = base_off + relative_off; let vec_map = inner.index.entry(key).or_default(); - let old = vec_map.append_or_update_last(lsn, off).unwrap().0; + let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0; if old.is_some() { - // We already had an entry for this LSN. That's odd.. - warn!("Key {} at {} already exists", key, lsn); + // This should not break anything, but is unexpected: ingestion code aims to filter out + // multiple writes to the same key at the same LSN. This happens in cases where our + // ingenstion code generates some write like an empty page, and we see a write from postgres + // to the same key in the same wal record. If one such write makes it through, we + // index the most recent write, implicitly ignoring the earlier write. We log a warning + // because this case is unexpected, and we would like tests to fail if this happens. + warn!("Key {} at {} written twice at same LSN", key, lsn); } } - let size = inner.file.len(); - inner.resource_units.maybe_publish_size(size); + inner.resource_units.maybe_publish_size(new_size); Ok(()) } @@ -537,7 +746,7 @@ impl InMemoryLayer { { let inner = self.inner.write().await; for vec_map in inner.index.values() { - for (lsn, _pos) in vec_map.as_slice() { + for (lsn, _) in vec_map.as_slice() { assert!(*lsn < end_lsn); } } @@ -601,36 +810,23 @@ impl InMemoryLayer { match l0_flush_global_state { l0_flush::Inner::Direct { .. } => { let file_contents: Vec = inner.file.load_to_vec(ctx).await?; - assert_eq!( - file_contents.len() % PAGE_SZ, - 0, - "needed by BlockReaderRef::Slice" - ); - assert_eq!(file_contents.len(), { - let written = usize::try_from(inner.file.len()).unwrap(); - if written % PAGE_SZ == 0 { - written - } else { - written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap() - } - }); - let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents)); - - let mut buf = Vec::new(); + let file_contents = Bytes::from(file_contents); for (key, vec_map) in inner.index.iter() { // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - // TODO: once we have blob lengths in the in-memory index, we can - // 1. get rid of the blob_io / BlockReaderRef::Slice business and - // 2. load the file contents into a Bytes and - // 3. the use `Bytes::slice` to get the `buf` that is our blob - // 4. pass that `buf` into `put_value_bytes` - // => https://github.com/neondatabase/neon/issues/8183 - cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?; - let will_init = Value::des(&buf)?.will_init(); - let (tmp, res) = delta_layer_writer + for (lsn, entry) in vec_map + .as_slice() + .iter() + .map(|(lsn, entry)| (lsn, entry.unpack())) + { + let IndexEntryUnpacked { + pos, + len, + will_init, + } = entry; + let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize); + let (_buf, res) = delta_layer_writer .put_value_bytes( Key::from_compact(*key), *lsn, @@ -640,7 +836,6 @@ impl InMemoryLayer { ) .await; res?; - buf = tmp.into_raw_slice().into_inner(); } } } @@ -662,3 +857,134 @@ impl InMemoryLayer { Ok(Some((desc, path))) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_index_entry() { + const MAX_SUPPORTED_POS: usize = IndexEntry::MAX_SUPPORTED_POS; + use IndexEntryNewArgs as Args; + use IndexEntryUnpacked as Unpacked; + + let roundtrip = |args, expect: Unpacked| { + let res = IndexEntry::new(args).expect("this tests expects no errors"); + let IndexEntryUnpacked { + will_init, + len, + pos, + } = res.unpack(); + assert_eq!(will_init, expect.will_init); + assert_eq!(len, expect.len); + assert_eq!(pos, expect.pos); + }; + + // basic roundtrip + for pos in [0, MAX_SUPPORTED_POS] { + for len in [0, MAX_SUPPORTED_BLOB_LEN] { + for will_init in [true, false] { + let expect = Unpacked { + will_init, + len: len.into_u64(), + pos: pos.into_u64(), + }; + roundtrip( + Args { + will_init, + base_offset: pos.into_u64(), + batch_offset: 0, + len, + }, + expect, + ); + roundtrip( + Args { + will_init, + base_offset: 0, + batch_offset: pos.into_u64(), + len, + }, + expect, + ); + } + } + } + + // too-large len + let too_large = Args { + will_init: false, + len: MAX_SUPPORTED_BLOB_LEN + 1, + base_offset: 0, + batch_offset: 0, + }; + assert!(IndexEntry::new(too_large).is_err()); + + // too-large pos + { + let too_large = Args { + will_init: false, + len: 0, + base_offset: MAX_SUPPORTED_POS.into_u64() + 1, + batch_offset: 0, + }; + assert!(IndexEntry::new(too_large).is_err()); + let too_large = Args { + will_init: false, + len: 0, + base_offset: 0, + batch_offset: MAX_SUPPORTED_POS.into_u64() + 1, + }; + assert!(IndexEntry::new(too_large).is_err()); + } + + // too large (base_offset + batch_offset) + { + let too_large = Args { + will_init: false, + len: 0, + base_offset: MAX_SUPPORTED_POS.into_u64(), + batch_offset: 1, + }; + assert!(IndexEntry::new(too_large).is_err()); + let too_large = Args { + will_init: false, + len: 0, + base_offset: MAX_SUPPORTED_POS.into_u64() - 1, + batch_offset: MAX_SUPPORTED_POS.into_u64() - 1, + }; + assert!(IndexEntry::new(too_large).is_err()); + } + + // valid special cases + // - area past the max supported pos that is accessible by len + for len in [1, MAX_SUPPORTED_BLOB_LEN] { + roundtrip( + Args { + will_init: false, + len, + base_offset: MAX_SUPPORTED_POS.into_u64(), + batch_offset: 0, + }, + Unpacked { + will_init: false, + len: len as u64, + pos: MAX_SUPPORTED_POS.into_u64(), + }, + ); + roundtrip( + Args { + will_init: false, + len, + base_offset: 0, + batch_offset: MAX_SUPPORTED_POS.into_u64(), + }, + Unpacked { + will_init: false, + len: len as u64, + pos: MAX_SUPPORTED_POS.into_u64(), + }, + ); + } + } +} diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs new file mode 100644 index 0000000000..0683e15659 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -0,0 +1,937 @@ +use std::{ + collections::BTreeMap, + sync::{Arc, RwLock}, +}; + +use itertools::Itertools; +use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; + +use crate::{ + assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}, + context::RequestContext, +}; + +/// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`]. +pub trait File: Send { + /// Attempt to read the bytes in `self` in range `[start,start+dst.bytes_total())` + /// and return the number of bytes read (let's call it `nread`). + /// The bytes read are placed in `dst`, i.e., `&dst[..nread]` will contain the read bytes. + /// + /// The only reason why the read may be short (i.e., `nread != dst.bytes_total()`) + /// is if the file is shorter than `start+dst.len()`. + /// + /// This is unlike [`std::os::unix::fs::FileExt::read_exact_at`] which returns an + /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`. + /// + /// No guarantees are made about the remaining bytes in `dst` in case of a short read. + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + dst: Slice, + ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)>; +} + +/// A logical read from [`File`]. See [`Self::new`]. +pub struct LogicalRead { + pos: u64, + state: RwLockRefCell>, +} + +enum LogicalReadState { + NotStarted(B), + Ongoing(B), + Ok(B), + Error(Arc), + Undefined, +} + +impl LogicalRead { + /// Create a new [`LogicalRead`] from [`File`] of the data in the file in range `[ pos, pos + buf.cap() )`. + pub fn new(pos: u64, buf: B) -> Self { + Self { + pos, + state: RwLockRefCell::new(LogicalReadState::NotStarted(buf)), + } + } + pub fn into_result(self) -> Option>> { + match self.state.into_inner() { + LogicalReadState::Ok(buf) => Some(Ok(buf)), + LogicalReadState::Error(e) => Some(Err(e)), + LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => None, + LogicalReadState::Undefined => unreachable!(), + } + } +} + +/// The buffer into which a [`LogicalRead`] result is placed. +pub trait Buffer: std::ops::Deref { + /// Immutable. + fn cap(&self) -> usize; + /// Changes only through [`Self::extend_from_slice`]. + fn len(&self) -> usize; + /// Panics if the total length would exceed the initialized capacity. + fn extend_from_slice(&mut self, src: &[u8]); +} + +/// The minimum alignment and size requirement for disk offsets and memory buffer size for direct IO. +const DIO_CHUNK_SIZE: usize = 512; + +/// If multiple chunks need to be read, merge adjacent chunk reads into batches of max size `MAX_CHUNK_BATCH_SIZE`. +/// (The unit is the number of chunks.) +const MAX_CHUNK_BATCH_SIZE: usize = { + let desired = 128 * 1024; // 128k + if desired % DIO_CHUNK_SIZE != 0 { + panic!("MAX_CHUNK_BATCH_SIZE must be a multiple of DIO_CHUNK_SIZE") + // compile-time error + } + desired / DIO_CHUNK_SIZE +}; + +/// Execute the given logical `reads` against `file`. +/// The results are placed in the buffers of the [`LogicalRead`]s. +/// Retrieve the results by calling [`LogicalRead::into_result`] on each [`LogicalRead`]. +/// +/// The [`LogicalRead`]s must be freshly created using [`LogicalRead::new`] when calling this function. +/// Otherwise, this function panics. +pub async fn execute<'a, I, F, B>(file: &F, reads: I, ctx: &RequestContext) +where + I: IntoIterator>, + F: File, + B: Buffer + IoBufMut + Send, +{ + // Terminology: + // logical read = a request to read an arbitrary range of bytes from `file`; byte-level granularity + // chunk = we conceptually divide up the byte range of `file` into DIO_CHUNK_SIZEs ranges + // interest = a range within a chunk that a logical read is interested in; one logical read gets turned into many interests + // physical read = the read request we're going to issue to the OS; covers a range of chunks; chunk-level granularity + + // Preserve a copy of the logical reads for debug assertions at the end + #[cfg(debug_assertions)] + let (reads, assert_logical_reads) = { + let (reads, assert) = reads.into_iter().tee(); + (reads, Some(Vec::from_iter(assert))) + }; + #[cfg(not(debug_assertions))] + let (reads, assert_logical_reads): (_, Option>>) = (reads, None); + + // Plan which parts of which chunks need to be appended to which buffer + let mut by_chunk: BTreeMap>> = BTreeMap::new(); + struct Interest<'a, B: Buffer> { + logical_read: &'a LogicalRead, + offset_in_chunk: u64, + len: u64, + } + for logical_read in reads { + let LogicalRead { pos, state } = logical_read; + let mut state = state.borrow_mut(); + + // transition from NotStarted to Ongoing + let cur = std::mem::replace(&mut *state, LogicalReadState::Undefined); + let req_len = match cur { + LogicalReadState::NotStarted(buf) => { + if buf.len() != 0 { + panic!("The `LogicalRead`s that are passed in must be freshly created using `LogicalRead::new`"); + } + // buf.cap() == 0 is ok + + // transition into Ongoing state + let req_len = buf.cap(); + *state = LogicalReadState::Ongoing(buf); + req_len + } + x => panic!("must only call with fresh LogicalReads, got another state, leaving Undefined state behind state={x:?}"), + }; + + // plan which chunks we need to read from + let mut remaining = req_len; + let mut chunk_no = *pos / (DIO_CHUNK_SIZE.into_u64()); + let mut offset_in_chunk = pos.into_usize() % DIO_CHUNK_SIZE; + while remaining > 0 { + let remaining_in_chunk = std::cmp::min(remaining, DIO_CHUNK_SIZE - offset_in_chunk); + by_chunk.entry(chunk_no).or_default().push(Interest { + logical_read, + offset_in_chunk: offset_in_chunk.into_u64(), + len: remaining_in_chunk.into_u64(), + }); + offset_in_chunk = 0; + chunk_no += 1; + remaining -= remaining_in_chunk; + } + } + + // At this point, we could iterate over by_chunk, in chunk order, + // read each chunk from disk, and fill the buffers. + // However, we can merge adjacent chunks into batches of MAX_CHUNK_BATCH_SIZE + // so we issue fewer IOs = fewer roundtrips = lower overall latency. + struct PhysicalRead<'a, B: Buffer> { + start_chunk_no: u64, + nchunks: usize, + dsts: Vec>, + } + struct PhysicalInterest<'a, B: Buffer> { + logical_read: &'a LogicalRead, + offset_in_physical_read: u64, + len: u64, + } + let mut physical_reads: Vec> = Vec::new(); + let mut by_chunk = by_chunk.into_iter().peekable(); + loop { + let mut last_chunk_no = None; + let to_merge: Vec<(u64, Vec>)> = by_chunk + .peeking_take_while(|(chunk_no, _)| { + if let Some(last_chunk_no) = last_chunk_no { + if *chunk_no != last_chunk_no + 1 { + return false; + } + } + last_chunk_no = Some(*chunk_no); + true + }) + .take(MAX_CHUNK_BATCH_SIZE) + .collect(); // TODO: avoid this .collect() + let Some(start_chunk_no) = to_merge.first().map(|(chunk_no, _)| *chunk_no) else { + break; + }; + let nchunks = to_merge.len(); + let dsts = to_merge + .into_iter() + .enumerate() + .flat_map(|(i, (_, dsts))| { + dsts.into_iter().map( + move |Interest { + logical_read, + offset_in_chunk, + len, + }| { + PhysicalInterest { + logical_read, + offset_in_physical_read: i + .checked_mul(DIO_CHUNK_SIZE) + .unwrap() + .into_u64() + + offset_in_chunk, + len, + } + }, + ) + }) + .collect(); + physical_reads.push(PhysicalRead { + start_chunk_no, + nchunks, + dsts, + }); + } + drop(by_chunk); + + // Execute physical reads and fill the logical read buffers + // TODO: pipelined reads; prefetch; + let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE); + for PhysicalRead { + start_chunk_no, + nchunks, + dsts, + } in physical_reads + { + let all_done = dsts + .iter() + .all(|PhysicalInterest { logical_read, .. }| logical_read.state.borrow().is_terminal()); + if all_done { + continue; + } + let read_offset = start_chunk_no + .checked_mul(DIO_CHUNK_SIZE.into_u64()) + .expect("we produce chunk_nos by dividing by DIO_CHUNK_SIZE earlier"); + let io_buf = get_io_buffer(nchunks).slice_full(); + let req_len = io_buf.len(); + let (io_buf_slice, nread) = match file.read_exact_at_eof_ok(read_offset, io_buf, ctx).await + { + Ok(t) => t, + Err(e) => { + let e = Arc::new(e); + for PhysicalInterest { logical_read, .. } in dsts { + *logical_read.state.borrow_mut() = LogicalReadState::Error(Arc::clone(&e)); + // this will make later reads for the given LogicalRead short-circuit, see top of loop body + } + continue; + } + }; + let io_buf = io_buf_slice.into_inner(); + assert!( + nread <= io_buf.len(), + "the last chunk in the file can be a short read, so, no ==" + ); + let io_buf = &io_buf[..nread]; + for PhysicalInterest { + logical_read, + offset_in_physical_read, + len, + } in dsts + { + let mut logical_read_state_borrow = logical_read.state.borrow_mut(); + let logical_read_buf = match &mut *logical_read_state_borrow { + LogicalReadState::NotStarted(_) => { + unreachable!("we transition it into Ongoing at function entry") + } + LogicalReadState::Ongoing(buf) => buf, + LogicalReadState::Ok(_) | LogicalReadState::Error(_) => { + continue; + } + LogicalReadState::Undefined => unreachable!(), + }; + let range_in_io_buf = std::ops::Range { + start: offset_in_physical_read as usize, + end: offset_in_physical_read as usize + len as usize, + }; + assert!(range_in_io_buf.end >= range_in_io_buf.start); + if range_in_io_buf.end > nread { + let msg = format!( + "physical read returned EOF where this logical read expected more data in the file: offset=0x{read_offset:x} req_len=0x{req_len:x} nread=0x{nread:x} {:?}", + &*logical_read_state_borrow + ); + logical_read_state_borrow.transition_to_terminal(Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + msg, + ))); + continue; + } + let data = &io_buf[range_in_io_buf]; + + // Copy data from io buffer into the logical read buffer. + // (And in debug mode, validate that the buffer impl adheres to the Buffer trait spec.) + let pre = if cfg!(debug_assertions) { + Some((logical_read_buf.len(), logical_read_buf.cap())) + } else { + None + }; + logical_read_buf.extend_from_slice(data); + let post = if cfg!(debug_assertions) { + Some((logical_read_buf.len(), logical_read_buf.cap())) + } else { + None + }; + match (pre, post) { + (None, None) => {} + (Some(_), None) | (None, Some(_)) => unreachable!(), + (Some((pre_len, pre_cap)), Some((post_len, post_cap))) => { + assert_eq!(pre_len + len as usize, post_len); + assert_eq!(pre_cap, post_cap); + } + } + + if logical_read_buf.len() == logical_read_buf.cap() { + logical_read_state_borrow.transition_to_terminal(Ok(())); + } + } + } + + if let Some(assert_logical_reads) = assert_logical_reads { + for logical_read in assert_logical_reads { + assert!(logical_read.state.borrow().is_terminal()); + } + } +} + +impl LogicalReadState { + fn is_terminal(&self) -> bool { + match self { + LogicalReadState::NotStarted(_) | LogicalReadState::Ongoing(_) => false, + LogicalReadState::Ok(_) | LogicalReadState::Error(_) => true, + LogicalReadState::Undefined => unreachable!(), + } + } + fn transition_to_terminal(&mut self, err: std::io::Result<()>) { + let cur = std::mem::replace(self, LogicalReadState::Undefined); + let buf = match cur { + LogicalReadState::Ongoing(buf) => buf, + x => panic!("must only call in state Ongoing, got {x:?}"), + }; + *self = match err { + Ok(()) => LogicalReadState::Ok(buf), + Err(e) => LogicalReadState::Error(Arc::new(e)), + }; + } +} + +impl std::fmt::Debug for LogicalReadState { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + #[derive(Debug)] + #[allow(unused)] + struct BufferDebug { + len: usize, + cap: usize, + } + impl<'a> From<&'a dyn Buffer> for BufferDebug { + fn from(buf: &'a dyn Buffer) -> Self { + Self { + len: buf.len(), + cap: buf.cap(), + } + } + } + match self { + LogicalReadState::NotStarted(b) => { + write!(f, "NotStarted({:?})", BufferDebug::from(b as &dyn Buffer)) + } + LogicalReadState::Ongoing(b) => { + write!(f, "Ongoing({:?})", BufferDebug::from(b as &dyn Buffer)) + } + LogicalReadState::Ok(b) => write!(f, "Ok({:?})", BufferDebug::from(b as &dyn Buffer)), + LogicalReadState::Error(e) => write!(f, "Error({:?})", e), + LogicalReadState::Undefined => write!(f, "Undefined"), + } + } +} + +#[derive(Debug)] +struct RwLockRefCell(RwLock); +impl RwLockRefCell { + fn new(value: T) -> Self { + Self(RwLock::new(value)) + } + fn borrow(&self) -> impl std::ops::Deref + '_ { + self.0.try_read().unwrap() + } + fn borrow_mut(&self) -> impl std::ops::DerefMut + '_ { + self.0.try_write().unwrap() + } + fn into_inner(self) -> T { + self.0.into_inner().unwrap() + } +} + +impl Buffer for Vec { + fn cap(&self) -> usize { + self.capacity() + } + + fn len(&self) -> usize { + self.len() + } + + fn extend_from_slice(&mut self, src: &[u8]) { + if self.len() + src.len() > self.cap() { + panic!("Buffer capacity exceeded"); + } + Vec::extend_from_slice(self, src); + } +} + +#[cfg(test)] +#[allow(clippy::assertions_on_constants)] +mod tests { + use rand::Rng; + + use crate::{ + context::DownloadBehavior, task_mgr::TaskKind, + virtual_file::owned_buffers_io::slice::SliceMutExt, + }; + + use super::*; + use std::{cell::RefCell, collections::VecDeque}; + + struct InMemoryFile { + content: Vec, + } + + impl InMemoryFile { + fn new_random(len: usize) -> Self { + Self { + content: rand::thread_rng() + .sample_iter(rand::distributions::Standard) + .take(len) + .collect(), + } + } + fn test_logical_read(&self, pos: u64, len: usize) -> TestLogicalRead { + let expected_result = if pos as usize + len > self.content.len() { + Err("InMemoryFile short read".to_string()) + } else { + Ok(self.content[pos as usize..pos as usize + len].to_vec()) + }; + TestLogicalRead::new(pos, len, expected_result) + } + } + + #[test] + fn test_in_memory_file() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let file = InMemoryFile::new_random(10); + let test_read = |pos, len| { + let buf = vec![0; len]; + let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx); + use futures::FutureExt; + let (slice, nread) = fut + .now_or_never() + .expect("impl never awaits") + .expect("impl never errors"); + let mut buf = slice.into_inner(); + buf.truncate(nread); + buf + }; + assert_eq!(test_read(0, 1), &file.content[0..1]); + assert_eq!(test_read(1, 2), &file.content[1..3]); + assert_eq!(test_read(9, 2), &file.content[9..]); + assert!(test_read(10, 2).is_empty()); + assert!(test_read(11, 2).is_empty()); + } + + impl File for InMemoryFile { + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + mut dst: Slice, + _ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)> { + let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); + let nread = { + let req_len = dst_slice.len(); + let len = std::cmp::min(req_len, self.content.len().saturating_sub(start as usize)); + if start as usize >= self.content.len() { + 0 + } else { + dst_slice[..len] + .copy_from_slice(&self.content[start as usize..start as usize + len]); + len + } + }; + rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[nread..]); // to discover bugs + Ok((dst, nread)) + } + } + + #[derive(Clone)] + struct TestLogicalRead { + pos: u64, + len: usize, + expected_result: Result, String>, + } + + impl TestLogicalRead { + fn new(pos: u64, len: usize, expected_result: Result, String>) -> Self { + Self { + pos, + len, + expected_result, + } + } + fn make_logical_read(&self) -> LogicalRead> { + LogicalRead::new(self.pos, Vec::with_capacity(self.len)) + } + } + + async fn execute_and_validate_test_logical_reads( + file: &F, + test_logical_reads: I, + ctx: &RequestContext, + ) where + I: IntoIterator, + F: File, + { + let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee(); + let logical_reads = tmp.map(|tr| tr.make_logical_read()).collect::>(); + execute(file, logical_reads.iter(), ctx).await; + for (logical_read, test_logical_read) in logical_reads.into_iter().zip(test_logical_reads) { + let actual = logical_read.into_result().expect("we call execute()"); + match (actual, test_logical_read.expected_result) { + (Ok(actual), Ok(expected)) if actual == expected => {} + (Err(actual), Err(expected)) => { + assert_eq!(actual.to_string(), expected); + } + (actual, expected) => panic!("expected {expected:?}\nactual {actual:?}"), + } + } + } + + #[tokio::test] + async fn test_blackbox() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let cs = DIO_CHUNK_SIZE; + let cs_u64 = cs.into_u64(); + + let file = InMemoryFile::new_random(10 * cs); + + let test_logical_reads = vec![ + file.test_logical_read(0, 1), + // adjacent to logical_read0 + file.test_logical_read(1, 2), + // gap + // spans adjacent chunks + file.test_logical_read(cs_u64 - 1, 2), + // gap + // tail of chunk 3, all of chunk 4, and 2 bytes of chunk 5 + file.test_logical_read(3 * cs_u64 - 1, cs + 2), + // gap + file.test_logical_read(5 * cs_u64, 1), + ]; + let num_test_logical_reads = test_logical_reads.len(); + let test_logical_reads_perms = test_logical_reads + .into_iter() + .permutations(num_test_logical_reads); + + // test all orderings of LogicalReads, the order shouldn't matter for the results + for test_logical_reads in test_logical_reads_perms { + execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await; + } + } + + #[tokio::test] + #[should_panic] + async fn test_reusing_logical_reads_panics() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + let file = InMemoryFile::new_random(DIO_CHUNK_SIZE); + let a = file.test_logical_read(23, 10); + let logical_reads = vec![a.make_logical_read()]; + execute(&file, &logical_reads, &ctx).await; + // reuse pancis + execute(&file, &logical_reads, &ctx).await; + } + + struct RecorderFile<'a> { + recorded: RefCell>, + file: &'a InMemoryFile, + } + + struct RecordedRead { + pos: u64, + req_len: usize, + res: Vec, + } + + impl<'a> RecorderFile<'a> { + fn new(file: &'a InMemoryFile) -> RecorderFile<'a> { + Self { + recorded: Default::default(), + file, + } + } + } + + impl<'x> File for RecorderFile<'x> { + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + dst: Slice, + ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)> { + let (dst, nread) = self.file.read_exact_at_eof_ok(start, dst, ctx).await?; + self.recorded.borrow_mut().push(RecordedRead { + pos: start, + req_len: dst.bytes_total(), + res: Vec::from(&dst[..nread]), + }); + Ok((dst, nread)) + } + } + + #[tokio::test] + async fn test_logical_reads_to_same_chunk_are_merged_into_one_chunk_read() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let file = InMemoryFile::new_random(2 * DIO_CHUNK_SIZE); + + let a = file.test_logical_read(DIO_CHUNK_SIZE.into_u64(), 10); + let b = file.test_logical_read(DIO_CHUNK_SIZE.into_u64() + 30, 20); + + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await; + + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 1); + let RecordedRead { pos, req_len, .. } = &recorded[0]; + assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + } + + #[tokio::test] + async fn test_max_chunk_batch_size_is_respected() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let file = InMemoryFile::new_random(4 * MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE); + + // read the 10th byte of each chunk 3 .. 3+2*MAX_CHUNK_BATCH_SIZE + assert!(3 < MAX_CHUNK_BATCH_SIZE, "test assumption"); + assert!(10 < DIO_CHUNK_SIZE, "test assumption"); + let mut test_logical_reads = Vec::new(); + for i in 3..3 + MAX_CHUNK_BATCH_SIZE + MAX_CHUNK_BATCH_SIZE / 2 { + test_logical_reads + .push(file.test_logical_read(i.into_u64() * DIO_CHUNK_SIZE.into_u64() + 10, 1)); + } + + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, test_logical_reads, &ctx).await; + + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 2); + { + let RecordedRead { pos, req_len, .. } = &recorded[0]; + assert_eq!(*pos as usize, 3 * DIO_CHUNK_SIZE); + assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE); + } + { + let RecordedRead { pos, req_len, .. } = &recorded[1]; + assert_eq!(*pos as usize, (3 + MAX_CHUNK_BATCH_SIZE) * DIO_CHUNK_SIZE); + assert_eq!(*req_len, MAX_CHUNK_BATCH_SIZE / 2 * DIO_CHUNK_SIZE); + } + } + + #[tokio::test] + async fn test_batch_breaks_if_chunk_is_not_interesting() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + assert!(MAX_CHUNK_BATCH_SIZE > 10, "test assumption"); + let file = InMemoryFile::new_random(3 * DIO_CHUNK_SIZE); + + let a = file.test_logical_read(0, 1); // chunk 0 + let b = file.test_logical_read(2 * DIO_CHUNK_SIZE.into_u64(), 1); // chunk 2 + + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, vec![a, b], &ctx).await; + + let recorded = recorder.recorded.borrow(); + + assert_eq!(recorded.len(), 2); + { + let RecordedRead { pos, req_len, .. } = &recorded[0]; + assert_eq!(*pos, 0); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + } + { + let RecordedRead { pos, req_len, .. } = &recorded[1]; + assert_eq!(*pos, 2 * DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + } + } + + struct ExpectedRead { + expect_pos: u64, + expect_len: usize, + respond: Result, String>, + } + + struct MockFile { + expected: RefCell>, + } + + impl Drop for MockFile { + fn drop(&mut self) { + assert!( + self.expected.borrow().is_empty(), + "expected reads not satisfied" + ); + } + } + + macro_rules! mock_file { + ($($pos:expr , $len:expr => $respond:expr),* $(,)?) => {{ + MockFile { + expected: RefCell::new(VecDeque::from(vec![$(ExpectedRead { + expect_pos: $pos, + expect_len: $len, + respond: $respond, + }),*])), + } + }}; + } + + impl File for MockFile { + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + &'b self, + start: u64, + mut dst: Slice, + _ctx: &'a RequestContext, + ) -> std::io::Result<(Slice, usize)> { + let ExpectedRead { + expect_pos, + expect_len, + respond, + } = self + .expected + .borrow_mut() + .pop_front() + .expect("unexpected read"); + assert_eq!(start, expect_pos); + assert_eq!(dst.bytes_total(), expect_len); + match respond { + Ok(mocked_bytes) => { + let len = std::cmp::min(dst.bytes_total(), mocked_bytes.len()); + let dst_slice: &mut [u8] = dst.as_mut_rust_slice_full_zeroed(); + dst_slice[..len].copy_from_slice(&mocked_bytes[..len]); + rand::Rng::fill(&mut rand::thread_rng(), &mut dst_slice[len..]); // to discover bugs + Ok((dst, len)) + } + Err(e) => Err(std::io::Error::new(std::io::ErrorKind::Other, e)), + } + } + } + + #[tokio::test] + async fn test_mock_file() { + // Self-test to ensure the relevant features of mock file work as expected. + + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let mock_file = mock_file! { + 0 , 512 => Ok(vec![0; 512]), + 512 , 512 => Ok(vec![1; 512]), + 1024 , 512 => Ok(vec![2; 10]), + 2048, 1024 => Err("foo".to_owned()), + }; + + let buf = Vec::with_capacity(512); + let (buf, nread) = mock_file + .read_exact_at_eof_ok(0, buf.slice_full(), &ctx) + .await + .unwrap(); + assert_eq!(nread, 512); + assert_eq!(&buf.into_inner()[..nread], &[0; 512]); + + let buf = Vec::with_capacity(512); + let (buf, nread) = mock_file + .read_exact_at_eof_ok(512, buf.slice_full(), &ctx) + .await + .unwrap(); + assert_eq!(nread, 512); + assert_eq!(&buf.into_inner()[..nread], &[1; 512]); + + let buf = Vec::with_capacity(512); + let (buf, nread) = mock_file + .read_exact_at_eof_ok(1024, buf.slice_full(), &ctx) + .await + .unwrap(); + assert_eq!(nread, 10); + assert_eq!(&buf.into_inner()[..nread], &[2; 10]); + + let buf = Vec::with_capacity(1024); + let err = mock_file + .read_exact_at_eof_ok(2048, buf.slice_full(), &ctx) + .await + .err() + .unwrap(); + assert_eq!(err.to_string(), "foo"); + } + + #[tokio::test] + async fn test_error_on_one_chunk_read_fails_only_dependent_logical_reads() { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + + let test_logical_reads = vec![ + // read spanning two batches + TestLogicalRead::new( + DIO_CHUNK_SIZE.into_u64() / 2, + MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE, + Err("foo".to_owned()), + ), + // second read in failing chunk + TestLogicalRead::new( + (MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64() + DIO_CHUNK_SIZE.into_u64() - 10, + 5, + Err("foo".to_owned()), + ), + // read unaffected + TestLogicalRead::new( + (MAX_CHUNK_BATCH_SIZE * DIO_CHUNK_SIZE).into_u64() + + 2 * DIO_CHUNK_SIZE.into_u64() + + 10, + 5, + Ok(vec![1; 5]), + ), + ]; + let (tmp, test_logical_reads) = test_logical_reads.into_iter().tee(); + let test_logical_read_perms = tmp.permutations(test_logical_reads.len()); + + for test_logical_reads in test_logical_read_perms { + let file = mock_file!( + 0, MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE => Ok(vec![0; MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE]), + (MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Err("foo".to_owned()), + (MAX_CHUNK_BATCH_SIZE*DIO_CHUNK_SIZE + 2*DIO_CHUNK_SIZE).into_u64(), DIO_CHUNK_SIZE => Ok(vec![1; DIO_CHUNK_SIZE]), + ); + execute_and_validate_test_logical_reads(&file, test_logical_reads, &ctx).await; + } + } + + struct TestShortReadsSetup { + ctx: RequestContext, + file: InMemoryFile, + written: u64, + } + fn setup_short_chunk_read_tests() -> TestShortReadsSetup { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + assert!(DIO_CHUNK_SIZE > 20, "test assumption"); + let written = (2 * DIO_CHUNK_SIZE - 10).into_u64(); + let file = InMemoryFile::new_random(written as usize); + TestShortReadsSetup { ctx, file, written } + } + + #[tokio::test] + async fn test_short_chunk_read_from_written_range() { + // Test what happens if there are logical reads + // that start within the last chunk, and + // the last chunk is not the full chunk length. + // + // The read should succeed despite the short chunk length. + let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests(); + + let a = file.test_logical_read(written - 10, 5); + let recorder = RecorderFile::new(&file); + + execute_and_validate_test_logical_reads(&recorder, vec![a], &ctx).await; + + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 1); + let RecordedRead { pos, req_len, res } = &recorded[0]; + assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]); + } + + #[tokio::test] + async fn test_short_chunk_read_and_logical_read_from_unwritten_range() { + // Test what happens if there are logical reads + // that start within the last chunk, and + // the last chunk is not the full chunk length, and + // the logical reads end in the unwritten range. + // + // All should fail with UnexpectedEof and have the same IO pattern. + async fn the_impl(offset_delta: i64) { + let TestShortReadsSetup { ctx, file, written } = setup_short_chunk_read_tests(); + + let offset = u64::try_from( + i64::try_from(written) + .unwrap() + .checked_add(offset_delta) + .unwrap(), + ) + .unwrap(); + let a = file.test_logical_read(offset, 5); + let recorder = RecorderFile::new(&file); + let a_vr = a.make_logical_read(); + execute(&recorder, vec![&a_vr], &ctx).await; + + // validate the LogicalRead result + let a_res = a_vr.into_result().unwrap(); + let a_err = a_res.unwrap_err(); + assert_eq!(a_err.kind(), std::io::ErrorKind::UnexpectedEof); + + // validate the IO pattern + let recorded = recorder.recorded.borrow(); + assert_eq!(recorded.len(), 1); + let RecordedRead { pos, req_len, res } = &recorded[0]; + assert_eq!(*pos, DIO_CHUNK_SIZE.into_u64()); + assert_eq!(*req_len, DIO_CHUNK_SIZE); + assert_eq!(res, &file.content[DIO_CHUNK_SIZE..(written as usize)]); + } + + the_impl(-1).await; // start == length - 1 + the_impl(0).await; // start == length + the_impl(1).await; // start == length + 1 + } + + // TODO: mixed: some valid, some UnexpectedEof + + // TODO: same tests but with merges +} diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 53bb66b95e..b15cd4da39 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -13,8 +13,7 @@ use utils::lsn::Lsn; use utils::sync::{gate, heavier_once_cell}; use crate::config::PageServerConf; -use crate::context::{DownloadBehavior, RequestContext}; -use crate::repository::Key; +use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::task_mgr::TaskKind; use crate::tenant::timeline::{CompactionError, GetVectoredError}; @@ -334,23 +333,6 @@ impl Layer { }) } - /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future. - #[allow(dead_code)] - pub(crate) async fn load_key_values( - &self, - ctx: &RequestContext, - ) -> anyhow::Result> { - let layer = self - .0 - .get_or_maybe_download(true, Some(ctx)) - .await - .map_err(|err| match err { - DownloadError::DownloadCancelled => GetVectoredError::Cancelled, - other => GetVectoredError::Other(anyhow::anyhow!(other)), - })?; - layer.load_key_values(&self.0, ctx).await - } - /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. @@ -1494,8 +1476,9 @@ impl LayerInner { let duration = SystemTime::now().duration_since(local_layer_mtime); match duration { Ok(elapsed) => { - let accessed = self.access_stats.accessed(); - if accessed { + let accessed_and_visible = self.access_stats.accessed() + && self.access_stats.visibility() == LayerVisibilityHint::Visible; + if accessed_and_visible { // Only layers used for reads contribute to our "low residence" metric that is used // to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed // to be rapidly evicted without contributing to this metric. @@ -1509,7 +1492,7 @@ impl LayerInner { tracing::info!( residence_millis = elapsed.as_millis(), - accessed, + accessed_and_visible, "evicted layer after known residence period" ); } @@ -1695,6 +1678,9 @@ impl DownloadedLayer { ); let res = if owner.desc.is_delta { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(crate::context::PageContentKind::DeltaLayerSummary) + .build(); let summary = Some(delta_layer::Summary::expected( owner.desc.tenant_shard_id.tenant_id, owner.desc.timeline_id, @@ -1705,11 +1691,14 @@ impl DownloadedLayer { &owner.path, summary, Some(owner.conf.max_vectored_read_bytes), - ctx, + &ctx, ) .await .map(LayerKind::Delta) } else { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(crate::context::PageContentKind::ImageLayerSummary) + .build(); let lsn = owner.desc.image_layer_lsn(); let summary = Some(image_layer::Summary::expected( owner.desc.tenant_shard_id.tenant_id, @@ -1722,7 +1711,7 @@ impl DownloadedLayer { lsn, summary, Some(owner.conf.max_vectored_read_bytes), - ctx, + &ctx, ) .await .map(LayerKind::Image) @@ -1776,19 +1765,6 @@ impl DownloadedLayer { } } - async fn load_key_values( - &self, - owner: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result> { - use LayerKind::*; - - match self.get(owner, ctx).await? { - Delta(d) => d.load_key_values(ctx).await, - Image(i) => i.load_key_values(ctx).await, - } - } - async fn dump(&self, owner: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { use LayerKind::*; match self.get(owner, ctx).await? { diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index bffd2db800..0b9bde4f57 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() { let mut writer = timeline.writer().await; writer .put( - Key::from_i128(5), + crate::repository::Key::from_i128(5), Lsn(0x20), &Value::Image(Bytes::from_static(b"this does not matter either")), &ctx, diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index df910b5ad9..e8deb0a1e5 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -353,7 +353,7 @@ impl SplitDeltaLayerWriter { Ok(generated_layers) } - #[allow(dead_code)] + #[cfg(test)] pub(crate) async fn finish( self, tline: &Arc, diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 12f080f3c1..478e9bb4f0 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -10,7 +10,6 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; -use crate::tenant::config::defaults::DEFAULT_COMPACTION_PERIOD; use crate::tenant::throttle::Stats; use crate::tenant::timeline::CompactionError; use crate::tenant::{Tenant, TenantState}; @@ -192,20 +191,28 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } } - let started_at = Instant::now(); - let sleep_duration = if period == Duration::ZERO { + + let sleep_duration; + if period == Duration::ZERO { #[cfg(not(feature = "testing"))] info!("automatic compaction is disabled"); // check again in 10 seconds, in case it's been enabled again. - Duration::from_secs(10) + sleep_duration = Duration::from_secs(10) } else { + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Compaction, + }; + // Run compaction - match tenant.compaction_iteration(&cancel, &ctx).await { + let IterationResult { output, elapsed } = iteration.run(tenant.compaction_iteration(&cancel, &ctx)).await; + match output { Ok(has_pending_task) => { error_run_count = 0; // schedule the next compaction immediately in case there is a pending compaction task - if has_pending_task { Duration::ZERO } else { period } + sleep_duration = if has_pending_task { Duration::ZERO } else { period }; } Err(e) => { let wait_duration = backoff::exponential_backoff_duration_seconds( @@ -221,16 +228,14 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { &wait_duration, cancel.is_cancelled(), ); - wait_duration + sleep_duration = wait_duration; } } + + // the duration is recorded by performance tests by enabling debug in this function + tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); }; - let elapsed = started_at.elapsed(); - warn_when_period_overrun(elapsed, period, BackgroundLoopKind::Compaction); - - // the duration is recorded by performance tests by enabling debug in this function - tracing::debug!(elapsed_ms=elapsed.as_millis(), "compaction iteration complete"); // Perhaps we did no work and the walredo process has been idle for some time: // give it a chance to shut down to avoid leaving walredo process running indefinitely. @@ -368,23 +373,27 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { } } - let started_at = Instant::now(); - let gc_horizon = tenant.get_gc_horizon(); - let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 { + let sleep_duration; + if period == Duration::ZERO || gc_horizon == 0 { #[cfg(not(feature = "testing"))] info!("automatic GC is disabled"); // check again in 10 seconds, in case it's been enabled again. - Duration::from_secs(10) + sleep_duration = Duration::from_secs(10); } else { + let iteration = Iteration { + started_at: Instant::now(), + period, + kind: BackgroundLoopKind::Gc, + }; // Run gc - let res = tenant - .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx) + let IterationResult { output, elapsed: _ } = + iteration.run(tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)) .await; - match res { + match output { Ok(_) => { error_run_count = 0; - period + sleep_duration = period; } Err(crate::tenant::GcError::TenantCancelled) => { return; @@ -408,13 +417,11 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { error!("Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}"); } - wait_duration + sleep_duration = wait_duration; } } }; - warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); - if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await .is_ok() @@ -448,9 +455,11 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken // If compaction period is set to zero (to disable it), then we will use a reasonable default let period = if period == Duration::ZERO { - humantime::Duration::from_str(DEFAULT_COMPACTION_PERIOD) - .unwrap() - .into() + humantime::Duration::from_str( + pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD, + ) + .unwrap() + .into() } else { period }; @@ -468,14 +477,12 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken break; } - let started_at = Instant::now(); - tenant.ingest_housekeeping().await; - - warn_when_period_overrun( - started_at.elapsed(), + let iteration = Iteration { + started_at: Instant::now(), period, - BackgroundLoopKind::IngestHouseKeeping, - ); + kind: BackgroundLoopKind::IngestHouseKeeping, + }; + iteration.run(tenant.ingest_housekeeping()).await; } } .await; @@ -553,6 +560,54 @@ pub(crate) async fn delay_by_lease_length( } } +struct Iteration { + started_at: Instant, + period: Duration, + kind: BackgroundLoopKind, +} + +struct IterationResult { + output: O, + elapsed: Duration, +} + +impl Iteration { + #[instrument(skip_all)] + pub(crate) async fn run(self, fut: Fut) -> IterationResult + where + Fut: std::future::Future, + { + let Self { + started_at, + period, + kind, + } = self; + + let mut fut = std::pin::pin!(fut); + + // Wrap `fut` into a future that logs a message every `period` so that we get a + // very obvious breadcrumb in the logs _while_ a slow iteration is happening. + let liveness_logger = async move { + loop { + match tokio::time::timeout(period, &mut fut).await { + Ok(x) => return x, + Err(_) => { + // info level as per the same rationale why warn_when_period_overrun is info + // => https://github.com/neondatabase/neon/pull/5724 + info!("still running"); + } + } + } + }; + + let output = liveness_logger.await; + + let elapsed = started_at.elapsed(); + warn_when_period_overrun(elapsed, period, kind); + + IterationResult { output, elapsed } + } +} /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric. pub(crate) fn warn_when_period_overrun( elapsed: Duration, diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index f3f3d5e3ae..f222e708e1 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -10,6 +10,7 @@ use std::{ use arc_swap::ArcSwap; use enumset::EnumSet; use tracing::{error, warn}; +use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; use crate::{context::RequestContext, task_mgr::TaskKind}; @@ -33,8 +34,7 @@ pub struct Throttle { pub struct Inner { task_kinds: EnumSet, - rate_limiter: Arc, - config: Config, + rate_limiter: Arc, } pub type Config = pageserver_api::models::ThrottleConfig; @@ -77,8 +77,7 @@ where refill_interval, refill_amount, max, - fair, - } = &config; + } = config; let task_kinds: EnumSet = task_kinds .iter() .filter_map(|s| match TaskKind::from_str(s) { @@ -93,18 +92,21 @@ where } }) .collect(); + + // steady rate, we expect `refill_amount` requests per `refill_interval`. + // dividing gives us the rps. + let rps = f64::from(refill_amount.get()) / refill_interval.as_secs_f64(); + let config = LeakyBucketConfig::new(rps, f64::from(max)); + + // initial tracks how many tokens are available to put in the bucket + // we want how many tokens are currently in the bucket + let initial_tokens = max - initial; + + let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens)); + Inner { task_kinds, - rate_limiter: Arc::new( - leaky_bucket::RateLimiter::builder() - .initial(*initial) - .interval(*refill_interval) - .refill(refill_amount.get()) - .max(*max) - .fair(*fair) - .build(), - ), - config, + rate_limiter: Arc::new(rate_limiter), } } pub fn reconfigure(&self, config: Config) { @@ -127,7 +129,7 @@ where /// See [`Config::steady_rps`]. pub fn steady_rps(&self) -> f64 { - self.inner.load().config.steady_rps() + self.inner.load().rate_limiter.steady_rps() } pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option { @@ -136,18 +138,9 @@ where return None; }; let start = std::time::Instant::now(); - let mut did_throttle = false; - let acquire = inner.rate_limiter.acquire(key_count); - // turn off runtime-induced preemption (aka coop) so our `did_throttle` is accurate - let acquire = tokio::task::unconstrained(acquire); - let mut acquire = std::pin::pin!(acquire); - std::future::poll_fn(|cx| { - use std::future::Future; - let poll = acquire.as_mut().poll(cx); - did_throttle = did_throttle || poll.is_pending(); - poll - }) - .await; + + let did_throttle = inner.rate_limiter.acquire(key_count).await; + self.count_accounted.fetch_add(1, Ordering::Relaxed); if did_throttle { self.count_throttled.fetch_add(1, Ordering::Relaxed); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 098c196ee8..262dccac7d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -66,10 +66,9 @@ use std::{ use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ - config::defaults::DEFAULT_PITR_INTERVAL, layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, - storage_layer::PersistentLayerDesc, + storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, }, walredo, }; @@ -102,6 +101,7 @@ use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; use crate::keyspace::{KeyPartitioning, KeySpace}; @@ -218,7 +218,7 @@ pub(crate) struct RelSizeCache { } pub struct Timeline { - conf: &'static PageServerConf, + pub(crate) conf: &'static PageServerConf, tenant_conf: Arc>, myself: Weak, @@ -867,6 +867,11 @@ impl Timeline { .map(|ancestor| ancestor.timeline_id) } + /// Get the ancestor timeline + pub(crate) fn ancestor_timeline(&self) -> Option<&Arc> { + self.ancestor_timeline.as_ref() + } + /// Get the bytes written since the PITR cutoff on this branch, and /// whether this branch's ancestor_lsn is within its parent's PITR. pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { @@ -1907,6 +1912,8 @@ impl Timeline { true } else if projected_layer_size >= checkpoint_distance { + // NB: this check is relied upon by: + let _ = IndexEntry::validate_checkpoint_distance; info!( "Will roll layer at {} with layer size {} due to layer size ({})", projected_lsn, layer_size, projected_layer_size @@ -2236,7 +2243,7 @@ impl Timeline { }; if aux_file_policy == Some(AuxFilePolicy::V1) { - warn!("this timeline is using deprecated aux file policy V1"); + warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)"); } result.repartition_threshold = @@ -4530,7 +4537,6 @@ pub struct DeltaLayerTestDesc { #[cfg(test)] impl DeltaLayerTestDesc { - #[allow(dead_code)] pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { Self { lsn_range, @@ -5702,7 +5708,7 @@ impl<'a> TimelineWriter<'a> { return Ok(()); } - let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch); + let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch)?; let batch_max_lsn = serialized_batch.max_lsn; let buf_size: u64 = serialized_batch.raw.len() as u64; @@ -5739,6 +5745,12 @@ impl<'a> TimelineWriter<'a> { ctx: &RequestContext, ) -> anyhow::Result<()> { use utils::bin_ser::BeSer; + if !key.is_valid_key_on_write_path() { + bail!( + "the request contains data not supported by pageserver at TimelineWriter::put: {}", + key + ); + } let val_ser_size = value.serialized_size().unwrap() as usize; self.put_batch( vec![(key.to_compact(), lsn, val_ser_size, value.clone())], diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index aad75ac59c..6b9c8386f7 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -19,6 +19,7 @@ use bytes::Bytes; use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; +use pageserver_api::config::{CompactL0BypassPageCacheValidation, CompactL0Phase1ValueAccess}; use pageserver_api::key::KEY_SIZE; use pageserver_api::keyspace::ShardedRange; use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId}; @@ -29,7 +30,6 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; -use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD}; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::merge_iterator::MergeIterator; use crate::tenant::storage_layer::split_writer::{ @@ -43,6 +43,9 @@ use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter}; use crate::tenant::timeline::{Layer, ResidentLayer}; use crate::tenant::DeltaLayer; use crate::virtual_file::{MaybeFatalIo, VirtualFile}; +use pageserver_api::config::tenant_conf_defaults::{ + DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, +}; use crate::keyspace::KeySpace; use crate::repository::{Key, Value}; @@ -1433,43 +1436,6 @@ impl TryFrom for CompactLevel0Phase1Stats { } } -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] -pub enum CompactL0Phase1ValueAccess { - /// The old way. - PageCachedBlobIo, - /// The new way. - StreamingKmerge { - /// If set, we run both the old way and the new way, validate that - /// they are identical (=> [`CompactL0BypassPageCacheValidation`]), - /// and if the validation fails, - /// - in tests: fail them with a panic or - /// - in prod, log a rate-limited warning and use the old way's results. - /// - /// If not set, we only run the new way and trust its results. - validate: Option, - }, -} - -/// See [`CompactL0Phase1ValueAccess::StreamingKmerge`]. -#[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize)] -#[serde(rename_all = "kebab-case")] -pub enum CompactL0BypassPageCacheValidation { - /// Validate that the series of (key, lsn) pairs are the same. - KeyLsn, - /// Validate that the entire output of old and new way is identical. - KeyLsnValue, -} - -impl Default for CompactL0Phase1ValueAccess { - fn default() -> Self { - CompactL0Phase1ValueAccess::StreamingKmerge { - // TODO(https://github.com/neondatabase/neon/issues/8184): change to None once confident - validate: Some(CompactL0BypassPageCacheValidation::KeyLsnValue), - } - } -} - impl Timeline { /// Entry point for new tiered compaction algorithm. /// diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 0114473eda..cee259e2e0 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -31,7 +31,7 @@ use crate::{ task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, - walrecord::DecodedWALRecord, + walrecord::{decode_wal_record, DecodedWALRecord}, }; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; @@ -312,10 +312,25 @@ pub(super) async fn handle_walreceiver_connection( waldecoder.feed_bytes(data); { - let mut decoded = DecodedWALRecord::default(); let mut modification = timeline.begin_modification(startlsn); let mut uncommitted_records = 0; let mut filtered_records = 0; + + async fn commit( + modification: &mut DatadirModification<'_>, + uncommitted: &mut u64, + filtered: &mut u64, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + WAL_INGEST + .records_committed + .inc_by(*uncommitted - *filtered); + modification.commit(ctx).await?; + *uncommitted = 0; + *filtered = 0; + Ok(()) + } + while let Some((lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are @@ -324,9 +339,28 @@ pub(super) async fn handle_walreceiver_connection( return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } + // Deserialize WAL record + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?; + + if decoded.is_dbase_create_copy(timeline.pg_version) + && uncommitted_records > 0 + { + // Special case: legacy PG database creations operate by reading pages from a 'template' database: + // these are the only kinds of WAL record that require reading data blocks while ingesting. Ensure + // all earlier writes of data blocks are visible by committing any modification in flight. + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; + } + // Ingest the records without immediately committing them. let ingested = walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) + .ingest_record(decoded, lsn, &mut modification, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; if !ingested { @@ -349,21 +383,25 @@ pub(super) async fn handle_walreceiver_connection( || modification.approx_pending_bytes() > DatadirModification::MAX_PENDING_BYTES { - WAL_INGEST - .records_committed - .inc_by(uncommitted_records - filtered_records); - modification.commit(&ctx).await?; - uncommitted_records = 0; - filtered_records = 0; + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; } } // Commit the remaining records. if uncommitted_records > 0 { - WAL_INGEST - .records_committed - .inc_by(uncommitted_records - filtered_records); - modification.commit(&ctx).await?; + commit( + &mut modification, + &mut uncommitted_records, + &mut filtered_records, + &ctx, + ) + .await?; } } diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 54a3ad789b..4d51dc442d 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -16,7 +16,6 @@ //! Note that the vectored blob api does *not* go through the page cache. use std::collections::BTreeMap; -use std::num::NonZeroUsize; use bytes::BytesMut; use pageserver_api::key::Key; @@ -27,10 +26,7 @@ use utils::vec_map::VecMap; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; -use crate::virtual_file::VirtualFile; - -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub struct MaxVectoredReadBytes(pub NonZeroUsize); +use crate::virtual_file::{self, VirtualFile}; /// Metadata bundled with the start and end offset of a blob. #[derive(Copy, Clone, Debug)] @@ -60,7 +56,7 @@ pub struct VectoredBlobsBuf { pub struct VectoredRead { pub start: u64, pub end: u64, - /// Starting offsets and metadata for each blob in this read + /// Start offset and metadata for each blob in this read pub blobs_at: VecMap, } @@ -76,14 +72,109 @@ pub(crate) enum VectoredReadExtended { No, } -pub(crate) struct VectoredReadBuilder { +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +pub enum VectoredReadCoalesceMode { + /// Only coalesce exactly adjacent reads. + AdjacentOnly, + /// In addition to adjacent reads, also consider reads whose corresponding + /// `end` and `start` offsets reside at the same chunk. + Chunked(usize), +} + +impl VectoredReadCoalesceMode { + /// [`AdjacentVectoredReadBuilder`] is used if alignment requirement is 0, + /// whereas [`ChunkedVectoredReadBuilder`] is used for alignment requirement 1 and higher. + pub(crate) fn get() -> Self { + let align = virtual_file::get_io_buffer_alignment_raw(); + if align == 0 { + VectoredReadCoalesceMode::AdjacentOnly + } else { + VectoredReadCoalesceMode::Chunked(align) + } + } +} + +pub(crate) enum VectoredReadBuilder { + Adjacent(AdjacentVectoredReadBuilder), + Chunked(ChunkedVectoredReadBuilder), +} + +impl VectoredReadBuilder { + fn new_impl( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: Option, + mode: VectoredReadCoalesceMode, + ) -> Self { + match mode { + VectoredReadCoalesceMode::AdjacentOnly => Self::Adjacent( + AdjacentVectoredReadBuilder::new(start_offset, end_offset, meta, max_read_size), + ), + VectoredReadCoalesceMode::Chunked(chunk_size) => { + Self::Chunked(ChunkedVectoredReadBuilder::new( + start_offset, + end_offset, + meta, + max_read_size, + chunk_size, + )) + } + } + } + + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: usize, + mode: VectoredReadCoalesceMode, + ) -> Self { + Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), mode) + } + + pub(crate) fn new_streaming( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + mode: VectoredReadCoalesceMode, + ) -> Self { + Self::new_impl(start_offset, end_offset, meta, None, mode) + } + + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + match self { + VectoredReadBuilder::Adjacent(builder) => builder.extend(start, end, meta), + VectoredReadBuilder::Chunked(builder) => builder.extend(start, end, meta), + } + } + + pub(crate) fn build(self) -> VectoredRead { + match self { + VectoredReadBuilder::Adjacent(builder) => builder.build(), + VectoredReadBuilder::Chunked(builder) => builder.build(), + } + } + + pub(crate) fn size(&self) -> usize { + match self { + VectoredReadBuilder::Adjacent(builder) => builder.size(), + VectoredReadBuilder::Chunked(builder) => builder.size(), + } + } +} + +pub(crate) struct AdjacentVectoredReadBuilder { + /// Start offset of the read. start: u64, + // End offset of the read. end: u64, + /// Start offset and metadata for each blob in this read blobs_at: VecMap, max_read_size: Option, } -impl VectoredReadBuilder { +impl AdjacentVectoredReadBuilder { /// Start building a new vectored read. /// /// Note that by design, this does not check against reading more than `max_read_size` to @@ -93,7 +184,7 @@ impl VectoredReadBuilder { start_offset: u64, end_offset: u64, meta: BlobMeta, - max_read_size: usize, + max_read_size: Option, ) -> Self { let mut blobs_at = VecMap::default(); blobs_at @@ -104,7 +195,7 @@ impl VectoredReadBuilder { start: start_offset, end: end_offset, blobs_at, - max_read_size: Some(max_read_size), + max_read_size, } } /// Attempt to extend the current read with a new blob if the start @@ -113,13 +204,15 @@ impl VectoredReadBuilder { pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); let size = (end - start) as usize; - if self.end == start && { + let not_limited_by_max_read_size = { if let Some(max_read_size) = self.max_read_size { self.size() + size <= max_read_size } else { true } - } { + }; + + if self.end == start && not_limited_by_max_read_size { self.end = end; self.blobs_at .append(start, meta) @@ -144,6 +237,107 @@ impl VectoredReadBuilder { } } +pub(crate) struct ChunkedVectoredReadBuilder { + /// Start block number + start_blk_no: usize, + /// End block number (exclusive). + end_blk_no: usize, + /// Start offset and metadata for each blob in this read + blobs_at: VecMap, + max_read_size: Option, + /// Chunk size reads are coalesced into. + chunk_size: usize, +} + +/// Computes x / d rounded up. +fn div_round_up(x: usize, d: usize) -> usize { + (x + (d - 1)) / d +} + +impl ChunkedVectoredReadBuilder { + /// Start building a new vectored read. + /// + /// Note that by design, this does not check against reading more than `max_read_size` to + /// support reading larger blobs than the configuration value. The builder will be single use + /// however after that. + pub(crate) fn new( + start_offset: u64, + end_offset: u64, + meta: BlobMeta, + max_read_size: Option, + chunk_size: usize, + ) -> Self { + let mut blobs_at = VecMap::default(); + blobs_at + .append(start_offset, meta) + .expect("First insertion always succeeds"); + + let start_blk_no = start_offset as usize / chunk_size; + let end_blk_no = div_round_up(end_offset as usize, chunk_size); + Self { + start_blk_no, + end_blk_no, + blobs_at, + max_read_size, + chunk_size, + } + } + + /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk. + /// + /// The resulting size also must be below the max read size. + pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { + tracing::trace!(start, end, "trying to extend"); + let start_blk_no = start as usize / self.chunk_size; + let end_blk_no = div_round_up(end as usize, self.chunk_size); + + let not_limited_by_max_read_size = { + if let Some(max_read_size) = self.max_read_size { + let coalesced_size = (end_blk_no - self.start_blk_no) * self.chunk_size; + coalesced_size <= max_read_size + } else { + true + } + }; + + // True if the second block starts in the same block or the immediate next block where the first block ended. + // + // Note: This automatically handles the case where two blocks are adjacent to each other, + // whether they starts on chunk size boundary or not. + let is_adjacent_chunk_read = { + // 1. first.end & second.start are in the same block + self.end_blk_no == start_blk_no + 1 || + // 2. first.end ends one block before second.start + self.end_blk_no == start_blk_no + }; + + if is_adjacent_chunk_read && not_limited_by_max_read_size { + self.end_blk_no = end_blk_no; + self.blobs_at + .append(start, meta) + .expect("LSNs are ordered within vectored reads"); + + return VectoredReadExtended::Yes; + } + + VectoredReadExtended::No + } + + pub(crate) fn size(&self) -> usize { + (self.end_blk_no - self.start_blk_no) * self.chunk_size + } + + pub(crate) fn build(self) -> VectoredRead { + let start = (self.start_blk_no * self.chunk_size) as u64; + let end = (self.end_blk_no * self.chunk_size) as u64; + VectoredRead { + start, + end, + blobs_at: self.blobs_at, + } + } +} + #[derive(Copy, Clone, Debug)] pub enum BlobFlag { None, @@ -166,14 +360,18 @@ pub struct VectoredReadPlanner { prev: Option<(Key, Lsn, u64, BlobFlag)>, max_read_size: usize, + + mode: VectoredReadCoalesceMode, } impl VectoredReadPlanner { pub fn new(max_read_size: usize) -> Self { + let mode = VectoredReadCoalesceMode::get(); Self { blobs: BTreeMap::new(), prev: None, max_read_size, + mode, } } @@ -252,6 +450,7 @@ impl VectoredReadPlanner { end_offset, BlobMeta { key, lsn }, self.max_read_size, + self.mode, ); let prev_read_builder = current_read_builder.replace(next_read_builder); @@ -303,6 +502,18 @@ impl<'a> VectoredBlobReader<'a> { read.size(), buf.capacity() ); + + if cfg!(debug_assertions) { + let align = virtual_file::get_io_buffer_alignment() as u64; + debug_assert_eq!( + read.start % align, + 0, + "Read start at {} does not satisfy the required io buffer alignment ({} bytes)", + read.start, + align + ); + } + let mut buf = self .file .read_exact_at(buf.slice(0..read.size()), read.start, ctx) @@ -310,27 +521,20 @@ impl<'a> VectoredBlobReader<'a> { .into_inner(); let blobs_at = read.blobs_at.as_slice(); - let start_offset = blobs_at.first().expect("VectoredRead is never empty").0; + + let start_offset = read.start; let mut metas = Vec::with_capacity(blobs_at.len()); - // Blobs in `read` only provide their starting offset. The end offset // of a blob is implicit: the start of the next blob if one exists // or the end of the read. - let pairs = blobs_at.iter().zip( - blobs_at - .iter() - .map(Some) - .skip(1) - .chain(std::iter::once(None)), - ); // Some scratch space, put here for reusing the allocation let mut decompressed_vec = Vec::new(); - for ((offset, meta), next) in pairs { - let offset_in_buf = offset - start_offset; - let first_len_byte = buf[offset_in_buf as usize]; + for (blob_start, meta) in blobs_at { + let blob_start_in_buf = blob_start - start_offset; + let first_len_byte = buf[blob_start_in_buf as usize]; // Each blob is prefixed by a header containing its size and compression information. // Extract the size and skip that header to find the start of the data. @@ -340,7 +544,7 @@ impl<'a> VectoredBlobReader<'a> { (1, first_len_byte as u64, BYTE_UNCOMPRESSED) } else { let mut blob_size_buf = [0u8; 4]; - let offset_in_buf = offset_in_buf as usize; + let offset_in_buf = blob_start_in_buf as usize; blob_size_buf.copy_from_slice(&buf[offset_in_buf..offset_in_buf + 4]); blob_size_buf[0] &= !LEN_COMPRESSION_BIT_MASK; @@ -353,12 +557,8 @@ impl<'a> VectoredBlobReader<'a> { ) }; - let start_raw = offset_in_buf + size_length; - let end_raw = match next { - Some((next_blob_start_offset, _)) => next_blob_start_offset - start_offset, - None => start_raw + blob_size, - }; - assert_eq!(end_raw - start_raw, blob_size); + let start_raw = blob_start_in_buf + size_length; + let end_raw = start_raw + blob_size; let (start, end); if compression_bits == BYTE_UNCOMPRESSED { start = start_raw as usize; @@ -407,18 +607,22 @@ pub struct StreamingVectoredReadPlanner { max_cnt: usize, /// Size of the current batch cnt: usize, + + mode: VectoredReadCoalesceMode, } impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); assert!(max_read_size > 0); + let mode = VectoredReadCoalesceMode::get(); Self { read_builder: None, prev: None, max_cnt, max_read_size, cnt: 0, + mode, } } @@ -467,17 +671,12 @@ impl StreamingVectoredReadPlanner { } None => { self.read_builder = { - let mut blobs_at = VecMap::default(); - blobs_at - .append(start_offset, BlobMeta { key, lsn }) - .expect("First insertion always succeeds"); - - Some(VectoredReadBuilder { - start: start_offset, - end: end_offset, - blobs_at, - max_read_size: None, - }) + Some(VectoredReadBuilder::new_streaming( + start_offset, + end_offset, + BlobMeta { key, lsn }, + self.mode, + )) }; } } @@ -511,7 +710,9 @@ mod tests { use super::*; fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { - assert_eq!(read.start, offset_range.first().unwrap().2); + let align = virtual_file::get_io_buffer_alignment() as u64; + assert_eq!(read.start % align, 0); + assert_eq!(read.start / align, offset_range.first().unwrap().2 / align); let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect(); @@ -525,6 +726,68 @@ mod tests { assert_eq!(expected_offsets_in_read, offsets_in_read); } + #[test] + fn planner_chunked_coalesce_all_test() { + use crate::virtual_file; + + let chunk_size = virtual_file::get_io_buffer_alignment() as u64; + + // The test explicitly does not check chunk size < 512 + if chunk_size < 512 { + return; + } + + let max_read_size = chunk_size as usize * 8; + let key = Key::MIN; + let lsn = Lsn(0); + + let blob_descriptions = [ + (key, lsn, chunk_size / 8, BlobFlag::None), // Read 1 BEGIN + (key, lsn, chunk_size / 4, BlobFlag::Ignore), // Gap + (key, lsn, chunk_size / 2, BlobFlag::None), + (key, lsn, chunk_size - 2, BlobFlag::Ignore), // Gap + (key, lsn, chunk_size, BlobFlag::None), + (key, lsn, chunk_size * 2 - 1, BlobFlag::None), + (key, lsn, chunk_size * 2 + 1, BlobFlag::Ignore), // Gap + (key, lsn, chunk_size * 3 + 1, BlobFlag::None), + (key, lsn, chunk_size * 5 + 1, BlobFlag::None), + (key, lsn, chunk_size * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. + (key, lsn, chunk_size * 7 + 1, BlobFlag::None), + (key, lsn, chunk_size * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size) + (key, lsn, chunk_size * 9, BlobFlag::Ignore), // ==== skipped a chunk + (key, lsn, chunk_size * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce) + ]; + + let ranges = [ + &[ + blob_descriptions[0], + blob_descriptions[2], + blob_descriptions[4], + blob_descriptions[5], + blob_descriptions[7], + blob_descriptions[8], + blob_descriptions[10], + ], + &blob_descriptions[11..12], + &blob_descriptions[13..], + ]; + + let mut planner = VectoredReadPlanner::new(max_read_size); + for (key, lsn, offset, flag) in blob_descriptions { + planner.handle(key, lsn, offset, flag); + } + + planner.handle_range_end(652 * 1024); + + let reads = planner.finish(); + + assert_eq!(reads.len(), ranges.len()); + + for (idx, read) in reads.iter().enumerate() { + validate_read(read, ranges[idx]); + } + } + #[test] fn planner_max_read_size_test() { let max_read_size = 128 * 1024; @@ -571,18 +834,19 @@ mod tests { #[test] fn planner_replacement_test() { - let max_read_size = 128 * 1024; + let chunk_size = virtual_file::get_io_buffer_alignment() as u64; + let max_read_size = 128 * chunk_size as usize; let first_key = Key::MIN; let second_key = first_key.next(); let lsn = Lsn(0); let blob_descriptions = vec![ - (first_key, lsn, 0, BlobFlag::None), // First in read 1 - (first_key, lsn, 1024, BlobFlag::None), // Last in read 1 - (second_key, lsn, 2 * 1024, BlobFlag::ReplaceAll), - (second_key, lsn, 3 * 1024, BlobFlag::None), - (second_key, lsn, 4 * 1024, BlobFlag::ReplaceAll), // First in read 2 - (second_key, lsn, 5 * 1024, BlobFlag::None), // Last in read 2 + (first_key, lsn, 0, BlobFlag::None), // First in read 1 + (first_key, lsn, chunk_size, BlobFlag::None), // Last in read 1 + (second_key, lsn, 2 * chunk_size, BlobFlag::ReplaceAll), + (second_key, lsn, 3 * chunk_size, BlobFlag::None), + (second_key, lsn, 4 * chunk_size, BlobFlag::ReplaceAll), // First in read 2 + (second_key, lsn, 5 * chunk_size, BlobFlag::None), // Last in read 2 ]; let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; @@ -592,7 +856,7 @@ mod tests { planner.handle(key, lsn, offset, flag); } - planner.handle_range_end(6 * 1024); + planner.handle_range_end(6 * chunk_size); let reads = planner.finish(); assert_eq!(reads.len(), 2); @@ -737,6 +1001,7 @@ mod tests { let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; let mut buf = BytesMut::with_capacity(reserved_bytes); + let mode = VectoredReadCoalesceMode::get(); let vectored_blob_reader = VectoredBlobReader::new(&file); let meta = BlobMeta { key: Key::MIN, @@ -748,7 +1013,7 @@ mod tests { if idx + 1 == offsets.len() { continue; } - let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096); + let read_builder = VectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, mode); let read = read_builder.build(); let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; assert_eq!(result.blobs.len(), 1); @@ -784,4 +1049,12 @@ mod tests { round_trip_test_compressed(&blobs, true).await?; Ok(()) } + + #[test] + fn test_div_round_up() { + const CHUNK_SIZE: usize = 512; + assert_eq!(1, div_round_up(200, CHUNK_SIZE)); + assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE)); + assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE)); + } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index c0017280fd..ed6ff86c10 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -18,6 +18,7 @@ use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::io_buf_ext::FullSlice; +use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; @@ -1140,10 +1141,13 @@ impl OpenFiles { /// server startup. /// #[cfg(not(test))] -pub fn init(num_slots: usize, engine: IoEngineKind) { +pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } + if set_io_buffer_alignment(io_buffer_alignment).is_err() { + panic!("IO buffer alignment ({io_buffer_alignment}) is not a power of two"); + } io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -1167,6 +1171,53 @@ fn get_open_files() -> &'static OpenFiles { } } +static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT); + +/// Returns true if `x` is zero or a power of two. +fn is_zero_or_power_of_two(x: usize) -> bool { + (x == 0) || ((x & (x - 1)) == 0) +} + +#[allow(unused)] +pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> { + if is_zero_or_power_of_two(align) { + IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed); + Ok(()) + } else { + Err(align) + } +} + +/// Gets the io buffer alignment requirement. Returns 0 if there is no requirement specified. +/// +/// This function should be used to check the raw config value. +pub(crate) fn get_io_buffer_alignment_raw() -> usize { + let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed); + + if cfg!(test) { + let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT"; + if let Some(test_align) = utils::env::var(env_var_name) { + if is_zero_or_power_of_two(test_align) { + test_align + } else { + panic!("IO buffer alignment ({test_align}) is not a power of two"); + } + } else { + align + } + } else { + align + } +} + +/// Gets the io buffer alignment requirement. Returns 1 if the alignment config is set to zero. +/// +/// This function should be used for getting the actual alignment value to use. +pub(crate) fn get_io_buffer_alignment() -> usize { + let align = get_io_buffer_alignment_raw(); + align.max(1) +} + #[cfg(test)] mod tests { use crate::context::DownloadBehavior; diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index faef1ba9ff..ccde90ee1a 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -84,9 +84,14 @@ pub(crate) fn get() -> IoEngine { } }, Err(std::env::VarError::NotPresent) => { - crate::config::defaults::DEFAULT_VIRTUAL_FILE_IO_ENGINE - .parse() - .unwrap() + #[cfg(target_os = "linux")] + { + IoEngineKind::TokioEpollUring + } + #[cfg(not(target_os = "linux"))] + { + IoEngineKind::StdFs + } } Err(std::env::VarError::NotUnicode(_)) => { panic!("env var {env_var_name} is not unicode"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index f8f37b17e3..568cf62e56 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -78,6 +78,7 @@ where .expect("must not use after we returned an error") } + /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted. #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn write_buffered( &mut self, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 8425528740..2d3841881b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,19 +21,25 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. +use std::time::Duration; +use std::time::SystemTime; + use pageserver_api::shard::ShardIdentity; use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes; use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment; +use postgres_ffi::TimestampTz; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use utils::failpoint_support; +use utils::rate_limit::RateLimit; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::{DatadirModification, Version}; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::walrecord::*; @@ -51,8 +57,16 @@ use utils::lsn::Lsn; pub struct WalIngest { shard: ShardIdentity, + pg_version: u32, checkpoint: CheckPoint, checkpoint_modified: bool, + warn_ingest_lag: WarnIngestLag, +} + +struct WarnIngestLag { + lag_msg_ratelimit: RateLimit, + future_lsn_msg_ratelimit: RateLimit, + timestamp_invalid_msg_ratelimit: RateLimit, } impl WalIngest { @@ -69,8 +83,14 @@ impl WalIngest { Ok(WalIngest { shard: *timeline.get_shard_identity(), + pg_version: timeline.pg_version, checkpoint, checkpoint_modified: false, + warn_ingest_lag: WarnIngestLag { + lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), + future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), + timestamp_invalid_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), + }, }) } @@ -86,10 +106,9 @@ impl WalIngest { /// pub async fn ingest_record( &mut self, - recdata: Bytes, + decoded: DecodedWALRecord, lsn: Lsn, modification: &mut DatadirModification<'_>, - decoded: &mut DecodedWALRecord, ctx: &RequestContext, ) -> anyhow::Result { WAL_INGEST.records_received.inc(); @@ -97,7 +116,12 @@ impl WalIngest { let prev_len = modification.len(); modification.set_lsn(lsn)?; - decode_wal_record(recdata, decoded, pg_version)?; + + if decoded.is_dbase_create_copy(self.pg_version) { + // Records of this type should always be preceded by a commit(), as they + // rely on reading data pages back from the Timeline. + assert!(!modification.has_dirty_data_pages()); + } let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -115,11 +139,11 @@ impl WalIngest { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { // Heap AM records need some special handling, because they modify VM pages // without registering them with the standard mechanism. - self.ingest_heapam_record(&mut buf, modification, decoded, ctx) + self.ingest_heapam_record(&mut buf, modification, &decoded, ctx) .await?; } pg_constants::RM_NEON_ID => { - self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx) + self.ingest_neonrmgr_record(&mut buf, modification, &decoded, ctx) .await?; } // Handle other special record types @@ -307,7 +331,7 @@ impl WalIngest { } pg_constants::RM_RELMAP_ID => { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded, ctx) + self.ingest_relmap_page(modification, &xlrec, &decoded, ctx) .await?; } pg_constants::RM_XLOG_ID => { @@ -452,7 +476,7 @@ impl WalIngest { continue; } - self.ingest_decoded_block(modification, lsn, decoded, blk, ctx) + self.ingest_decoded_block(modification, lsn, &decoded, blk, ctx) .await?; } @@ -468,6 +492,8 @@ impl WalIngest { // until commit() is called to flush the data into the repository and update // the latest LSN. + modification.on_record_end(); + Ok(modification.len() > prev_len) } @@ -539,6 +565,7 @@ impl WalIngest { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx) .await?; } else { @@ -1177,7 +1204,7 @@ impl WalIngest { if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { // Tail of last remaining FSM page has to be zeroed. // We are not precise here and instead of digging in FSM bitmap format just clear the whole page. - modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, fsm_physical_page_no); fsm_physical_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1199,7 +1226,7 @@ impl WalIngest { if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { // Tail of last remaining vm page has to be zeroed. // We are not precise here and instead of digging in VM bitmap format just clear the whole page. - modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, vm_page_no); vm_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; @@ -1212,6 +1239,48 @@ impl WalIngest { Ok(()) } + fn warn_on_ingest_lag( + &mut self, + conf: &crate::config::PageServerConf, + wal_timestmap: TimestampTz, + ) { + debug_assert_current_span_has_tenant_and_timeline_id(); + let now = SystemTime::now(); + let rate_limits = &mut self.warn_ingest_lag; + match try_from_pg_timestamp(wal_timestmap) { + Ok(ts) => { + match now.duration_since(ts) { + Ok(lag) => { + if lag > conf.wait_lsn_timeout { + rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| { + let lag = humantime::format_duration(lag); + warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout"); + }) + } + }, + Err(e) => { + let delta_t = e.duration(); + // determined by prod victoriametrics query: 1000 * (timestamp(node_time_seconds{neon_service="pageserver"}) - node_time_seconds) + // => https://www.robustperception.io/time-metric-from-the-node-exporter/ + const IGNORED_DRIFT: Duration = Duration::from_millis(100); + if delta_t > IGNORED_DRIFT { + let delta_t = humantime::format_duration(delta_t); + rate_limits.future_lsn_msg_ratelimit.call2(|rate_limit_stats| { + warn!(%rate_limit_stats, %delta_t, "ingesting record with timestamp from future"); + }) + } + } + }; + + } + Err(error) => { + rate_limits.timestamp_invalid_msg_ratelimit.call2(|rate_limit_stats| { + warn!(%rate_limit_stats, %error, "ingesting record with invalid timestamp, cannot calculate lag and will fail find-lsn-for-timestamp type queries"); + }) + } + } + } + /// Subroutine of ingest_record(), to handle an XLOG_XACT_* records. /// async fn ingest_xact_record( @@ -1228,6 +1297,8 @@ impl WalIngest { let mut rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; let mut page_xids: Vec = vec![parsed.xid]; + self.warn_on_ingest_lag(modification.tline.conf, parsed.xact_time); + for subxact in &parsed.subxacts { let subxact_pageno = subxact / pg_constants::CLOG_XACTS_PER_PAGE; if subxact_pageno != pageno { @@ -1625,7 +1696,7 @@ impl WalIngest { continue; } - modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?; + modification.put_rel_page_image_zero(rel, gap_blknum); } } Ok(()) @@ -1691,7 +1762,7 @@ impl WalIngest { // fill the gap with zeros for gap_blknum in old_nblocks..blknum { - modification.put_slru_page_image(kind, segno, gap_blknum, ZERO_PAGE.clone())?; + modification.put_slru_page_image_zero(kind, segno, gap_blknum); } } Ok(()) @@ -1765,21 +1836,25 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x30)); walingest .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x40)); walingest .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x50)); walingest .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_current_logical_size(&tline, Lsn(0x50)); @@ -1921,6 +1996,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_eq!( tline @@ -1946,6 +2022,7 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx) .await?; + m.on_record_end(); m.commit(&ctx).await?; assert_eq!( tline @@ -2303,6 +2380,9 @@ mod tests { let _endpoint = Lsn::from_hex("1FFFF98").unwrap(); let harness = TenantHarness::create("test_ingest_real_wal").await.unwrap(); + let span = harness + .span() + .in_scope(|| info_span!("timeline_span", timeline_id=%TIMELINE_ID)); let (tenant, ctx) = harness.load().await; let remote_initdb_path = @@ -2344,7 +2424,6 @@ mod tests { .await .unwrap(); let mut modification = tline.begin_modification(startpoint); - let mut decoded = DecodedWALRecord::default(); println!("decoding {} bytes", bytes.len() - xlogoff); // Decode and ingest wal. We process the wal in chunks because @@ -2352,8 +2431,11 @@ mod tests { for chunk in bytes[xlogoff..].chunks(50) { decoder.feed_bytes(chunk); while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(recdata, &mut decoded, modification.tline.pg_version).unwrap(); walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) + .ingest_record(decoded, lsn, &mut modification, &ctx) + .instrument(span.clone()) .await .unwrap(); } diff --git a/pageserver/src/walrecord.rs b/pageserver/src/walrecord.rs index edddcefbe1..0c4d575de8 100644 --- a/pageserver/src/walrecord.rs +++ b/pageserver/src/walrecord.rs @@ -160,6 +160,30 @@ pub struct DecodedWALRecord { pub origin_id: u16, } +impl DecodedWALRecord { + /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations + /// by reading other existing relations' data blocks. This is more complex to apply than new-style database + /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case. + pub(crate) fn is_dbase_create_copy(&self, pg_version: u32) -> bool { + if self.xl_rmid == pg_constants::RM_DBASE_ID { + let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + match pg_version { + 14 => { + // Postgres 14 database creations are always the legacy kind + info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + } + 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + _ => { + panic!("Unsupported postgres version {pg_version}") + } + } + } else { + false + } + } +} + #[repr(C)] #[derive(Debug, Clone, Copy)] pub struct RelFileNode { diff --git a/poetry.lock b/poetry.lock index 7db91e51f7..48943a73e9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -985,43 +985,38 @@ files = [ [[package]] name = "cryptography" -version = "42.0.4" +version = "43.0.1" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"}, - {file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"}, - {file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"}, - {file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"}, - {file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"}, - {file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"}, - {file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"}, - {file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"}, - {file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"}, - {file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"}, - {file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"}, - {file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"}, - {file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"}, - {file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"}, + {file = "cryptography-43.0.1-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:8385d98f6a3bf8bb2d65a73e17ed87a3ba84f6991c155691c51112075f9ffc5d"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27e613d7077ac613e399270253259d9d53872aaf657471473ebfc9a52935c062"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68aaecc4178e90719e95298515979814bda0cbada1256a4485414860bd7ab962"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:de41fd81a41e53267cb020bb3a7212861da53a7d39f863585d13ea11049cf277"}, + {file = "cryptography-43.0.1-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f98bf604c82c416bc829e490c700ca1553eafdf2912a91e23a79d97d9801372a"}, + {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:61ec41068b7b74268fa86e3e9e12b9f0c21fcf65434571dbb13d954bceb08042"}, + {file = "cryptography-43.0.1-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:014f58110f53237ace6a408b5beb6c427b64e084eb451ef25a28308270086494"}, + {file = "cryptography-43.0.1-cp37-abi3-win32.whl", hash = "sha256:2bd51274dcd59f09dd952afb696bf9c61a7a49dfc764c04dd33ef7a6b502a1e2"}, + {file = "cryptography-43.0.1-cp37-abi3-win_amd64.whl", hash = "sha256:666ae11966643886c2987b3b721899d250855718d6d9ce41b521252a17985f4d"}, + {file = "cryptography-43.0.1-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:ac119bb76b9faa00f48128b7f5679e1d8d437365c5d26f1c2c3f0da4ce1b553d"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1bbcce1a551e262dfbafb6e6252f1ae36a248e615ca44ba302df077a846a8806"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58d4e9129985185a06d849aa6df265bdd5a74ca6e1b736a77959b498e0505b85"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:d03a475165f3134f773d1388aeb19c2d25ba88b6a9733c5c590b9ff7bbfa2e0c"}, + {file = "cryptography-43.0.1-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:511f4273808ab590912a93ddb4e3914dfd8a388fed883361b02dea3791f292e1"}, + {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:80eda8b3e173f0f247f711eef62be51b599b5d425c429b5d4ca6a05e9e856baa"}, + {file = "cryptography-43.0.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:38926c50cff6f533f8a2dae3d7f19541432610d114a70808f0926d5aaa7121e4"}, + {file = "cryptography-43.0.1-cp39-abi3-win32.whl", hash = "sha256:a575913fb06e05e6b4b814d7f7468c2c660e8bb16d8d5a1faf9b33ccc569dd47"}, + {file = "cryptography-43.0.1-cp39-abi3-win_amd64.whl", hash = "sha256:d75601ad10b059ec832e78823b348bfa1a59f6b8d545db3a24fd44362a1564cb"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:ea25acb556320250756e53f9e20a4177515f012c9eaea17eb7587a8c4d8ae034"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c1332724be35d23a854994ff0b66530119500b6053d0bd3363265f7e5e77288d"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:fba1007b3ef89946dbbb515aeeb41e30203b004f0b4b00e5e16078b518563289"}, + {file = "cryptography-43.0.1-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:5b43d1ea6b378b54a1dc99dd8a2b5be47658fe9a7ce0a58ff0b55f4b43ef2b84"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:88cce104c36870d70c49c7c8fd22885875d950d9ee6ab54df2745f83ba0dc365"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d3cdb25fa98afdd3d0892d132b8d7139e2c087da1712041f6b762e4f807cc96"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:e710bf40870f4db63c3d7d929aa9e09e4e7ee219e703f949ec4073b4294f6172"}, + {file = "cryptography-43.0.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7c05650fe8023c5ed0d46793d4b7d7e6cd9c04e68eabe5b0aeea836e37bdcec2"}, + {file = "cryptography-43.0.1.tar.gz", hash = "sha256:203e92a75716d8cfb491dc47c79e17d0d9207ccffcbcb35f598fbe463ae3444d"}, ] [package.dependencies] @@ -1034,7 +1029,7 @@ nox = ["nox"] pep8test = ["check-sdist", "click", "mypy", "ruff"] sdist = ["build"] ssh = ["bcrypt (>=3.1.5)"] -test = ["certifi", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] +test = ["certifi", "cryptography-vectors (==43.0.1)", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"] test-randomorder = ["pytest-randomly"] [[package]] @@ -1110,13 +1105,13 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "4.0.1" +version = "5.0.0" description = "A Flask extension adding a decorator for CORS support" optional = false python-versions = "*" files = [ - {file = "Flask_Cors-4.0.1-py2.py3-none-any.whl", hash = "sha256:f2a704e4458665580c074b714c4627dd5a306b333deb9074d0b1794dfa2fb677"}, - {file = "flask_cors-4.0.1.tar.gz", hash = "sha256:eeb69b342142fdbf4766ad99357a7f3876a2ceb77689dc10ff912aac06c389e4"}, + {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, + {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, ] [package.dependencies] diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index e98da82053..1f44e4af5d 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -500,6 +500,7 @@ mod tests { use hyper1::service::service_fn; use hyper_util::rt::TokioIo; use rand::rngs::OsRng; + use rsa::pkcs8::DecodePrivateKey; use signature::Signer; use tokio::net::TcpListener; @@ -517,8 +518,8 @@ mod tests { (sk, jwk) } - fn new_rsa_jwk(kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) { - let sk = rsa::RsaPrivateKey::new(&mut OsRng, 2048).unwrap(); + fn new_rsa_jwk(key: &str, kid: String) -> (rsa::RsaPrivateKey, jose_jwk::Jwk) { + let sk = rsa::RsaPrivateKey::from_pkcs8_pem(key).unwrap(); let pk = sk.to_public_key().into(); let jwk = jose_jwk::Jwk { key: jose_jwk::Key::Rsa(pk), @@ -569,10 +570,70 @@ mod tests { format!("{payload}.{sig}") } + // RSA key gen is slow.... + const RS1: &str = "-----BEGIN PRIVATE KEY----- +MIIEvwIBADANBgkqhkiG9w0BAQEFAASCBKkwggSlAgEAAoIBAQDNuWBIWTlo+54Y +aifpGInIrpv6LlsbI/2/2CC81Arlx4RsABORklgA9XSGwaCbHTshHsfd1S916JwA +SpjyPQYWfqo6iAV8a4MhjIeJIkRr74prDCSzOGZvIc6VaGeCIb9clf3HSrPHm3hA +cfLMB8/p5MgoxERPDOIn3XYoS9SEEuP7l0LkmEZMerg6W6lDjQRDny0Lb50Jky9X +mDqnYXBhs99ranbwL5vjy0ba6OIeCWFJme5u+rv5C/P0BOYrJfGxIcEoKa8Ukw5s +PlM+qrz9ope1eOuXMNNdyFDReNBUyaM1AwBAayU5rz57crer7K/UIofaJ42T4cMM +nx/SWfBNAgMBAAECggEACqdpBxYn1PoC6/zDaFzu9celKEWyTiuE/qRwvZa1ocS9 +ZOJ0IPvVNud/S2NHsADJiSOQ8joSJScQvSsf1Ju4bv3MTw+wSQtAVUJz2nQ92uEi +5/xPAkEPfP3hNvebNLAOuvrBk8qYmOPCTIQaMNrOt6wzeXkAmJ9wLuRXNCsJLHW+ +KLpf2WdgTYxqK06ZiJERFgJ2r1MsC2IgTydzjOAdEIrtMarerTLqqCpwFrk/l0cz +1O2OAb17ZxmhuzMhjNMin81c8F2fZAGMeOjn92Jl5kUsYw/pG+0S8QKlbveR/fdP +We2tJsgXw2zD0q7OJpp8NXS2yddrZGyysYsof983wQKBgQD2McqNJqo+eWL5zony +UbL19loYw0M15EjhzIuzW1Jk0rPj65yQyzpJ6pqicRuWr34MvzCx+ZHM2b3jSiNu +GES2fnC7xLIKyeRxfqsXF71xz+6UStEGRQX27r1YWEtyQVuBhvlqB+AGWP3PYAC+ +HecZecnZ+vcihJ2K3+l5O3paVQKBgQDV6vKH5h2SY9vgO8obx0P7XSS+djHhmPuU +f8C/Fq6AuRbIA1g04pzuLU2WS9T26eIjgM173uVNg2TuqJveWzz+CAAp6nCR6l24 +DBg49lMGCWrMo4FqPG46QkUqvK8uSj42GkX/e5Rut1Gyu0209emeM6h2d2K15SvY +9563tYSmGQKBgQDwcH5WTi20KA7e07TroJi8GKWzS3gneNUpGQBS4VxdtV4UuXXF +/4TkzafJ/9cm2iurvUmMd6XKP9lw0mY5zp/E70WgTCBp4vUlVsU3H2tYbO+filYL +3ntNx6nKTykX4/a/UJfj0t8as+zli+gNxNx/h+734V9dKdFG4Rl+2fTLpQKBgQCE +qJkTEe+Q0wCOBEYICADupwqcWqwAXWDW7IrZdfVtulqYWwqecVIkmk+dPxWosc4d +ekjz4nyNH0i+gC15LVebqdaAJ/T7aD4KXuW+nXNLMRfcJCGjgipRUruWD0EMEdqW +rqBuGXMpXeH6VxGPgVkJVLvKC6tZZe9VM+pnvteuMQKBgQC8GaL+Lz+al4biyZBf +JE8ekWrIotq/gfUBLP7x70+PB9bNtXtlgmTvjgYg4jiu3KR/ZIYYQ8vfVgkb6tDI +rWGZw86Pzuoi1ppg/pYhKk9qrmCIT4HPEXbHl7ATahu2BOCIU3hybjTh2lB6LbX9 +8LMFlz1QPqSZYN/A/kOcLBfa3A== +-----END PRIVATE KEY----- +"; + const RS2: &str = "-----BEGIN PRIVATE KEY----- +MIIEvgIBADANBgkqhkiG9w0BAQEFAASCBKgwggSkAgEAAoIBAQDipm6FIKSRab3J +HwmK18t7hp+pohllxIDUSPi7S5mIhN/JG2Plq2Lp746E/fuT8dcBF2R4sJlG2L0J +zmxOvBU/i/sQF9s1i4CEfg05k2//gKENIEsF3pMMmrH+mcZi0TTD6rezHpdVxPHk +qWxSyOCtIJV29X+wxPwAB59kQFHzy2ooPB1isZcpE8tO0KthAM+oZ3KuCwE0++cO +IWLeq9aPwyKhtip/xjTMxd1kzdKh592mGSyzr9D0QSWOYFGvgJXANDdiPdhSSOLt +ECWPNPlm2FQvGGvYYBafUqz7VumKHE6x8J6lKdYa2J0ZdDzCIo2IHzlxe+RZNgwy +uAD2jhVxAgMBAAECggEAbsZHWBu3MzcKQiVARbLoygvnN0J5xUqAaMDtiKUPejDv +K1yOu67DXnDuKEP2VL2rhuYG/hHaKE1AP227c9PrUq6424m9YvM2sgrlrdFIuQkG +LeMtp8W7+zoUasp/ssZrUqICfLIj5xCl5UuFHQT/Ar7dLlIYwa3VOLKBDb9+Dnfe +QH5/So4uMXG6vw34JN9jf+eAc8Yt0PeIz62ycvRwdpTJQ0MxZN9ZKpCAQp+VTuXT +zlzNvDMilabEdqUvAyGyz8lBLNl0wdaVrqPqAEWM5U45QXsdFZknWammP7/tijeX +0z+Bi0J0uSEU5X502zm7GArj/NNIiWMcjmDjwUUhwQKBgQD9C2GoqxOxuVPYqwYR ++Jz7f2qMjlSP8adA5Lzuh8UKXDp8JCEQC8ryweLzaOKS9C5MAw+W4W2wd4nJoQI1 +P1dgGvBlfvEeRHMgqWtq7FuTsjSe7e0uSEkC4ngDb4sc0QOpv15cMuEz+4+aFLPL +x29EcHWAaBX+rkid3zpQHFU4eQKBgQDlTCEqRuXwwa3V+Sq+mNWzD9QIGtD87TH/ +FPO/Ij/cK2+GISgFDqhetiGTH4qrvPL0psPT+iH5zGFYcoFmTtwLdWQJdxhxz0bg +iX/AceyX5e1Bm+ThT36sU83NrxKPkrdk6jNmr2iUF1OTzTwUKOYdHOPZqdMPfF4M +4XAaWVT2uQKBgQD4nKcNdU+7LE9Rr+4d1/o8Klp/0BMK/ayK2HE7lc8kt6qKb2DA +iCWUTqPw7Fq3cQrPia5WWhNP7pJEtFkcAaiR9sW7onW5fBz0uR+dhK0QtmR2xWJj +N4fsOp8ZGQ0/eae0rh1CTobucLkM9EwV6VLLlgYL67e4anlUCo8bSEr+WQKBgQCB +uf6RgqcY/RqyklPCnYlZ0zyskS9nyXKd1GbK3j+u+swP4LZZlh9f5j88k33LCA2U +qLzmMwAB6cWxWqcnELqhqPq9+ClWSmTZKDGk2U936NfAZMirSGRsbsVi9wfTPriP +WYlXMSpDjqb0WgsBhNob4npubQxCGKTFOM5Jufy90QKBgB0Lte1jX144uaXx6dtB +rjXNuWNir0Jy31wHnQuCA+XnfUgPcrKmRLm8taMbXgZwxkNvgFkpUWU8aPEK08Ne +X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL +5JiconnI5aLek0QVPoFaVXFa +-----END PRIVATE KEY----- +"; + #[tokio::test] async fn renew() { - let (rs1, jwk1) = new_rsa_jwk("1".into()); - let (rs2, jwk2) = new_rsa_jwk("2".into()); + let (rs1, jwk1) = new_rsa_jwk(RS1, "1".into()); + let (rs2, jwk2) = new_rsa_jwk(RS2, "2".into()); let (ec1, jwk3) = new_ec_jwk("3".into()); let (ec2, jwk4) = new_ec_jwk("4".into()); diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 88caa9a316..c6f83fd069 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -613,40 +613,6 @@ mod tests { tmpdir.close().unwrap(); } - #[tokio::test] - async fn verify_parquet_min_compression() { - let tmpdir = camino_tempfile::tempdir().unwrap(); - - let config = ParquetConfig { - propeties: Arc::new( - WriterProperties::builder() - .set_compression(parquet::basic::Compression::ZSTD(ZstdLevel::default())) - .build(), - ), - rows_per_group: 2_000, - file_size: 1_000_000, - max_duration: time::Duration::from_secs(20 * 60), - test_remote_failures: 0, - }; - - let rx = random_stream(50_000); - let file_stats = run_test(tmpdir.path(), config, rx).await; - - // with compression, there are fewer files with more rows per file - assert_eq!( - file_stats, - [ - (1223214, 5, 10000), - (1229364, 5, 10000), - (1231158, 5, 10000), - (1230520, 5, 10000), - (1221798, 5, 10000) - ] - ); - - tmpdir.close().unwrap(); - } - #[tokio::test] async fn verify_parquet_strong_compression() { let tmpdir = camino_tempfile::tempdir().unwrap(); diff --git a/proxy/src/http.rs b/proxy/src/http.rs index fee634f67f..c77d95f47d 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -35,14 +35,17 @@ pub fn new_client() -> ClientWithMiddleware { .build() } -pub(crate) fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { +pub(crate) fn new_client_with_timeout( + request_timeout: Duration, + total_retry_duration: Duration, +) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() - .timeout(default_timout) + .timeout(request_timeout) .build() .expect("Failed to create http client with timeout"); let retry_policy = - ExponentialBackoff::builder().build_with_total_retry_duration(default_timout); + ExponentialBackoff::builder().build_with_total_retry_duration(total_retry_duration); reqwest_middleware::ClientBuilder::new(timeout_client) .with(reqwest_tracing::TracingMiddleware::default()) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index ccef88231b..2da7eac580 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -4,8 +4,8 @@ use lasso::ThreadedRodeo; use measured::{ label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, metric::{histogram::Thresholds, name::MetricName}, - Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, - LabelGroup, MetricGroup, + Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, + MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; @@ -548,6 +548,7 @@ pub enum RedisEventsCount { } pub struct ThreadPoolWorkers(usize); +#[derive(Copy, Clone)] pub struct ThreadPoolWorkerId(pub usize); impl LabelValue for ThreadPoolWorkerId { @@ -613,9 +614,6 @@ impl FixedCardinalitySet for ThreadPoolWorkers { #[derive(MetricGroup)] #[metric(new(workers: usize))] pub struct ThreadPoolMetrics { - pub injector_queue_depth: Gauge, - #[metric(init = GaugeVec::with_label_set(ThreadPoolWorkers(workers)))] - pub worker_queue_depth: GaugeVec, #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] pub worker_task_turns_total: CounterVec, #[metric(init = CounterVec::with_label_set(ThreadPoolWorkers(workers)))] diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index e5f5867998..6e38f89458 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -10,7 +10,5 @@ pub(crate) use limit_algorithm::{ }; pub(crate) use limiter::GlobalRateLimiter; -pub use leaky_bucket::{ - EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter, LeakyBucketState, -}; +pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index fa8cb75256..bf4d85f2e4 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -8,6 +8,7 @@ use dashmap::DashMap; use rand::{thread_rng, Rng}; use tokio::time::Instant; use tracing::info; +use utils::leaky_bucket::LeakyBucketState; use crate::intern::EndpointIdInt; @@ -16,7 +17,7 @@ pub type EndpointRateLimiter = LeakyBucketRateLimiter; pub struct LeakyBucketRateLimiter { map: DashMap, - config: LeakyBucketConfig, + config: utils::leaky_bucket::LeakyBucketConfig, access_count: AtomicUsize, } @@ -29,7 +30,7 @@ impl LeakyBucketRateLimiter { pub fn new_with_shards(config: LeakyBucketConfig, shards: usize) -> Self { Self { map: DashMap::with_hasher_and_shard_amount(RandomState::new(), shards), - config, + config: config.into(), access_count: AtomicUsize::new(0), } } @@ -42,12 +43,12 @@ impl LeakyBucketRateLimiter { self.do_gc(now); } - let mut entry = self.map.entry(key).or_insert_with(|| LeakyBucketState { - time: now, - filled: 0.0, - }); + let mut entry = self + .map + .entry(key) + .or_insert_with(|| LeakyBucketState { empty_at: now }); - entry.check(&self.config, now, n as f64) + entry.add_tokens(&self.config, now, n as f64).is_ok() } fn do_gc(&self, now: Instant) { @@ -59,7 +60,7 @@ impl LeakyBucketRateLimiter { let shard = thread_rng().gen_range(0..n); self.map.shards()[shard] .write() - .retain(|_, value| !value.get_mut().update(&self.config, now)); + .retain(|_, value| !value.get().bucket_is_empty(now)); } } @@ -68,11 +69,6 @@ pub struct LeakyBucketConfig { pub max: f64, } -pub struct LeakyBucketState { - filled: f64, - time: Instant, -} - #[cfg(test)] impl LeakyBucketConfig { pub(crate) fn new(rps: f64, max: f64) -> Self { @@ -82,40 +78,9 @@ impl LeakyBucketConfig { } } -impl LeakyBucketState { - pub(crate) fn new() -> Self { - Self { - filled: 0.0, - time: Instant::now(), - } - } - - /// updates the timer and returns true if the bucket is empty - fn update(&mut self, info: &LeakyBucketConfig, now: Instant) -> bool { - let drain = now.duration_since(self.time); - let drain = drain.as_secs_f64() * info.rps; - - self.filled = (self.filled - drain).clamp(0.0, info.max); - self.time = now; - - self.filled == 0.0 - } - - pub(crate) fn check(&mut self, info: &LeakyBucketConfig, now: Instant, n: f64) -> bool { - self.update(info, now); - - if self.filled + n > info.max { - return false; - } - self.filled += n; - - true - } -} - -impl Default for LeakyBucketState { - fn default() -> Self { - Self::new() +impl From for utils::leaky_bucket::LeakyBucketConfig { + fn from(config: LeakyBucketConfig) -> Self { + utils::leaky_bucket::LeakyBucketConfig::new(config.rps, config.max) } } @@ -125,48 +90,55 @@ mod tests { use std::time::Duration; use tokio::time::Instant; + use utils::leaky_bucket::LeakyBucketState; - use super::{LeakyBucketConfig, LeakyBucketState}; + use super::LeakyBucketConfig; #[tokio::test(start_paused = true)] async fn check() { - let info = LeakyBucketConfig::new(500.0, 2000.0); - let mut bucket = LeakyBucketState::new(); + let config: utils::leaky_bucket::LeakyBucketConfig = + LeakyBucketConfig::new(500.0, 2000.0).into(); + assert_eq!(config.cost, Duration::from_millis(2)); + assert_eq!(config.bucket_width, Duration::from_secs(4)); + + let mut bucket = LeakyBucketState { + empty_at: Instant::now(), + }; // should work for 2000 requests this second for _ in 0..2000 { - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } - assert!(!bucket.check(&info, Instant::now(), 1.0)); - assert_eq!(bucket.filled, 2000.0); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); + assert_eq!(bucket.empty_at - Instant::now(), config.bucket_width); // in 1ms we should drain 0.5 tokens. // make sure we don't lose any tokens tokio::time::advance(Duration::from_millis(1)).await; - assert!(!bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); tokio::time::advance(Duration::from_millis(1)).await; - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); // in 10ms we should drain 5 tokens tokio::time::advance(Duration::from_millis(10)).await; for _ in 0..5 { - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } - assert!(!bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); // in 10s we should drain 5000 tokens // but cap is only 2000 tokio::time::advance(Duration::from_secs(10)).await; for _ in 0..2000 { - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } - assert!(!bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap_err(); // should sustain 500rps for _ in 0..2000 { tokio::time::advance(Duration::from_millis(10)).await; for _ in 0..5 { - assert!(bucket.check(&info, Instant::now(), 1.0)); + bucket.add_tokens(&config, Instant::now(), 1.0).unwrap(); } } } diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 255694b33e..64ee0135e1 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -83,10 +83,10 @@ mod tests { let mut ids = vec![]; for _ in 0..n { - // number of insert operations - let n = rng.gen_range(1..100); // number to insert at once - let m = rng.gen_range(1..4096); + let n = rng.gen_range(1..4096); + // number of insert operations + let m = rng.gen_range(1..100); let id = uuid::Builder::from_random_bytes(rng.gen()).into_uuid(); ids.push((id, n, m)); @@ -102,17 +102,11 @@ mod tests { let mut ids2 = ids.clone(); while !ids2.is_empty() { ids2.shuffle(&mut rng); - - let mut i = 0; - while i < ids2.len() { - sketch.inc_and_return(&ids2[i].0, ids2[i].1); - ids2[i].2 -= 1; - if ids2[i].2 == 0 { - ids2.remove(i); - } else { - i += 1; - } - } + ids2.retain_mut(|id| { + sketch.inc_and_return(&id.0, id.1); + id.2 -= 1; + id.2 > 0 + }); } let mut within_p = 0; @@ -144,8 +138,8 @@ mod tests { // probably numbers are too small to truly represent the probabilities. assert_eq!(eval_precision(100, 4096.0, 0.90), 100); assert_eq!(eval_precision(1000, 4096.0, 0.90), 1000); - assert_eq!(eval_precision(100, 4096.0, 0.1), 98); - assert_eq!(eval_precision(1000, 4096.0, 0.1), 991); + assert_eq!(eval_precision(100, 4096.0, 0.1), 96); + assert_eq!(eval_precision(1000, 4096.0, 0.1), 988); } // returns memory usage in bytes, and the time complexity per insert. diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 7fdadc7038..786cbcaa19 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -86,8 +86,7 @@ async fn derive_client_key( ) -> ScramKey { let salted_password = pool .spawn_job(endpoint, Pbkdf2::start(password, salt, iterations)) - .await - .expect("job should not be cancelled"); + .await; let make_key = |name| { let key = Hmac::::new_from_slice(&salted_password) diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index d5ed9002ad..4cf76c8452 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -75,7 +75,7 @@ mod tests { let salt = b"sodium chloride"; let pass = b"Ne0n_!5_50_C007"; - let mut job = Pbkdf2::start(pass, salt, 600000); + let mut job = Pbkdf2::start(pass, salt, 60000); let hash = loop { let std::task::Poll::Ready(hash) = job.turn() else { continue; @@ -83,7 +83,7 @@ mod tests { break hash; }; - let expected = pbkdf2_hmac_array::(pass, salt, 600000); + let expected = pbkdf2_hmac_array::(pass, salt, 60000); assert_eq!(hash, expected); } } diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index 262c6d146e..2702aeebfe 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -4,17 +4,20 @@ //! 1. Fairness per endpoint. //! 2. Yield support for high iteration counts. -use std::sync::{ - atomic::{AtomicU64, Ordering}, - Arc, +use std::{ + cell::RefCell, + future::Future, + pin::Pin, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, Weak, + }, + task::{Context, Poll}, }; -use crossbeam_deque::{Injector, Stealer, Worker}; -use itertools::Itertools; -use parking_lot::{Condvar, Mutex}; +use futures::FutureExt; use rand::Rng; use rand::{rngs::SmallRng, SeedableRng}; -use tokio::sync::oneshot; use crate::{ intern::EndpointIdInt, @@ -25,273 +28,164 @@ use crate::{ use super::pbkdf2::Pbkdf2; pub struct ThreadPool { - queue: Injector, - stealers: Vec>, - parkers: Vec<(Condvar, Mutex)>, - /// bitpacked representation. - /// lower 8 bits = number of sleeping threads - /// next 8 bits = number of idle threads (searching for work) - counters: AtomicU64, - + runtime: Option, pub metrics: Arc, } -#[derive(PartialEq)] -enum ThreadState { - Parked, - Active, +/// How often to reset the sketch values +const SKETCH_RESET_INTERVAL: u64 = 1021; + +thread_local! { + static STATE: RefCell> = const { RefCell::new(None) }; } impl ThreadPool { pub fn new(n_workers: u8) -> Arc { - let workers = (0..n_workers).map(|_| Worker::new_fifo()).collect_vec(); - let stealers = workers.iter().map(|w| w.stealer()).collect_vec(); + // rayon would be nice here, but yielding in rayon does not work well afaict. - let parkers = (0..n_workers) - .map(|_| (Condvar::new(), Mutex::new(ThreadState::Active))) - .collect_vec(); + Arc::new_cyclic(|pool| { + let pool = pool.clone(); + let worker_id = AtomicUsize::new(0); - let pool = Arc::new(Self { - queue: Injector::new(), - stealers, - parkers, - // threads start searching for work - counters: AtomicU64::new((n_workers as u64) << 8), - metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), - }); + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(n_workers as usize) + .on_thread_start(move || { + STATE.with_borrow_mut(|state| { + *state = Some(ThreadRt { + pool: pool.clone(), + id: ThreadPoolWorkerId(worker_id.fetch_add(1, Ordering::Relaxed)), + rng: SmallRng::from_entropy(), + // used to determine whether we should temporarily skip tasks for fairness. + // 99% of estimates will overcount by no more than 4096 samples + countmin: CountMinSketch::with_params( + 1.0 / (SKETCH_RESET_INTERVAL as f64), + 0.01, + ), + tick: 0, + }); + }); + }) + .build() + .unwrap(); - for (i, worker) in workers.into_iter().enumerate() { - let pool = Arc::clone(&pool); - std::thread::spawn(move || thread_rt(pool, worker, i)); - } - - pool + Self { + runtime: Some(runtime), + metrics: Arc::new(ThreadPoolMetrics::new(n_workers as usize)), + } + }) } - pub(crate) fn spawn_job( - &self, - endpoint: EndpointIdInt, - pbkdf2: Pbkdf2, - ) -> oneshot::Receiver<[u8; 32]> { - let (tx, rx) = oneshot::channel(); - - let queue_was_empty = self.queue.is_empty(); - - self.metrics.injector_queue_depth.inc(); - self.queue.push(JobSpec { - response: tx, - pbkdf2, - endpoint, - }); - - // inspired from - let counts = self.counters.load(Ordering::SeqCst); - let num_awake_but_idle = (counts >> 8) & 0xff; - let num_sleepers = counts & 0xff; - - // If the queue is non-empty, then we always wake up a worker - // -- clearly the existing idle jobs aren't enough. Otherwise, - // check to see if we have enough idle workers. - if !queue_was_empty || num_awake_but_idle == 0 { - let num_to_wake = Ord::min(1, num_sleepers); - self.wake_any_threads(num_to_wake); - } - - rx - } - - #[cold] - fn wake_any_threads(&self, mut num_to_wake: u64) { - if num_to_wake > 0 { - for i in 0..self.parkers.len() { - if self.wake_specific_thread(i) { - num_to_wake -= 1; - if num_to_wake == 0 { - return; - } - } - } - } - } - - fn wake_specific_thread(&self, index: usize) -> bool { - let (condvar, lock) = &self.parkers[index]; - - let mut state = lock.lock(); - if *state == ThreadState::Parked { - condvar.notify_one(); - - // When the thread went to sleep, it will have incremented - // this value. When we wake it, its our job to decrement - // it. We could have the thread do it, but that would - // introduce a delay between when the thread was - // *notified* and when this counter was decremented. That - // might mislead people with new work into thinking that - // there are sleeping threads that they should try to - // wake, when in fact there is nothing left for them to - // do. - self.counters.fetch_sub(1, Ordering::SeqCst); - *state = ThreadState::Active; - - true - } else { - false - } - } - - fn steal(&self, rng: &mut impl Rng, skip: usize, worker: &Worker) -> Option { - // announce thread as idle - self.counters.fetch_add(256, Ordering::SeqCst); - - // try steal from the global queue - loop { - match self.queue.steal_batch_and_pop(worker) { - crossbeam_deque::Steal::Success(job) => { - self.metrics - .injector_queue_depth - .set(self.queue.len() as i64); - // no longer idle - self.counters.fetch_sub(256, Ordering::SeqCst); - return Some(job); - } - crossbeam_deque::Steal::Retry => continue, - crossbeam_deque::Steal::Empty => break, - } - } - - // try steal from our neighbours - loop { - let mut retry = false; - let start = rng.gen_range(0..self.stealers.len()); - let job = (start..self.stealers.len()) - .chain(0..start) - .filter(|i| *i != skip) - .find_map( - |victim| match self.stealers[victim].steal_batch_and_pop(worker) { - crossbeam_deque::Steal::Success(job) => Some(job), - crossbeam_deque::Steal::Empty => None, - crossbeam_deque::Steal::Retry => { - retry = true; - None - } - }, - ); - if job.is_some() { - // no longer idle - self.counters.fetch_sub(256, Ordering::SeqCst); - return job; - } - if !retry { - return None; - } - } + pub(crate) fn spawn_job(&self, endpoint: EndpointIdInt, pbkdf2: Pbkdf2) -> JobHandle { + JobHandle( + self.runtime + .as_ref() + .unwrap() + .spawn(JobSpec { pbkdf2, endpoint }), + ) } } -fn thread_rt(pool: Arc, worker: Worker, index: usize) { - /// interval when we should steal from the global queue - /// so that tail latencies are managed appropriately - const STEAL_INTERVAL: usize = 61; +impl Drop for ThreadPool { + fn drop(&mut self) { + self.runtime.take().unwrap().shutdown_background(); + } +} - /// How often to reset the sketch values - const SKETCH_RESET_INTERVAL: usize = 1021; +struct ThreadRt { + pool: Weak, + id: ThreadPoolWorkerId, + rng: SmallRng, + countmin: CountMinSketch, + tick: u64, +} - let mut rng = SmallRng::from_entropy(); +impl ThreadRt { + fn should_run(&mut self, job: &JobSpec) -> bool { + let rate = self + .countmin + .inc_and_return(&job.endpoint, job.pbkdf2.cost()); - // used to determine whether we should temporarily skip tasks for fairness. - // 99% of estimates will overcount by no more than 4096 samples - let mut sketch = CountMinSketch::with_params(1.0 / (SKETCH_RESET_INTERVAL as f64), 0.01); - - let (condvar, lock) = &pool.parkers[index]; - - 'wait: loop { - // wait for notification of work - { - let mut lock = lock.lock(); - - // queue is empty - pool.metrics - .worker_queue_depth - .set(ThreadPoolWorkerId(index), 0); - - // subtract 1 from idle count, add 1 to sleeping count. - pool.counters.fetch_sub(255, Ordering::SeqCst); - - *lock = ThreadState::Parked; - condvar.wait(&mut lock); - } - - for i in 0.. { - let Some(mut job) = worker - .pop() - .or_else(|| pool.steal(&mut rng, index, &worker)) - else { - continue 'wait; - }; - - pool.metrics - .worker_queue_depth - .set(ThreadPoolWorkerId(index), worker.len() as i64); - - // receiver is closed, cancel the task - if !job.response.is_closed() { - let rate = sketch.inc_and_return(&job.endpoint, job.pbkdf2.cost()); - - const P: f64 = 2000.0; - // probability decreases as rate increases. - // lower probability, higher chance of being skipped - // - // estimates (rate in terms of 4096 rounds): - // rate = 0 => probability = 100% - // rate = 10 => probability = 71.3% - // rate = 50 => probability = 62.1% - // rate = 500 => probability = 52.3% - // rate = 1021 => probability = 49.8% - // - // My expectation is that the pool queue will only begin backing up at ~1000rps - // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above - // are in requests per second. - let probability = P.ln() / (P + rate as f64).ln(); - if pool.queue.len() > 32 || rng.gen_bool(probability) { - pool.metrics - .worker_task_turns_total - .inc(ThreadPoolWorkerId(index)); - - match job.pbkdf2.turn() { - std::task::Poll::Ready(result) => { - let _ = job.response.send(result); - } - std::task::Poll::Pending => worker.push(job), - } - } else { - pool.metrics - .worker_task_skips_total - .inc(ThreadPoolWorkerId(index)); - - // skip for now - worker.push(job); - } - } - - // if we get stuck with a few long lived jobs in the queue - // it's better to try and steal from the queue too for fairness - if i % STEAL_INTERVAL == 0 { - let _ = pool.queue.steal_batch(&worker); - } - - if i % SKETCH_RESET_INTERVAL == 0 { - sketch.reset(); - } - } + const P: f64 = 2000.0; + // probability decreases as rate increases. + // lower probability, higher chance of being skipped + // + // estimates (rate in terms of 4096 rounds): + // rate = 0 => probability = 100% + // rate = 10 => probability = 71.3% + // rate = 50 => probability = 62.1% + // rate = 500 => probability = 52.3% + // rate = 1021 => probability = 49.8% + // + // My expectation is that the pool queue will only begin backing up at ~1000rps + // in which case the SKETCH_RESET_INTERVAL represents 1 second. Thus, the rates above + // are in requests per second. + let probability = P.ln() / (P + rate as f64).ln(); + self.rng.gen_bool(probability) } } struct JobSpec { - response: oneshot::Sender<[u8; 32]>, pbkdf2: Pbkdf2, endpoint: EndpointIdInt, } +impl Future for JobSpec { + type Output = [u8; 32]; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + STATE.with_borrow_mut(|state| { + let state = state.as_mut().expect("should be set on thread startup"); + + state.tick = state.tick.wrapping_add(1); + if state.tick % SKETCH_RESET_INTERVAL == 0 { + state.countmin.reset(); + } + + if state.should_run(&self) { + if let Some(pool) = state.pool.upgrade() { + pool.metrics.worker_task_turns_total.inc(state.id); + } + + match self.pbkdf2.turn() { + Poll::Ready(result) => Poll::Ready(result), + // more to do, we shall requeue + Poll::Pending => { + cx.waker().wake_by_ref(); + Poll::Pending + } + } + } else { + if let Some(pool) = state.pool.upgrade() { + pool.metrics.worker_task_skips_total.inc(state.id); + } + + cx.waker().wake_by_ref(); + Poll::Pending + } + }) + } +} + +pub(crate) struct JobHandle(tokio::task::JoinHandle<[u8; 32]>); + +impl Future for JobHandle { + type Output = [u8; 32]; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + match self.0.poll_unpin(cx) { + Poll::Ready(Ok(ok)) => Poll::Ready(ok), + Poll::Ready(Err(err)) => std::panic::resume_unwind(err.into_panic()), + Poll::Pending => Poll::Pending, + } + } +} + +impl Drop for JobHandle { + fn drop(&mut self) { + self.0.abort(); + } +} + #[cfg(test)] mod tests { use crate::EndpointId; @@ -308,8 +202,7 @@ mod tests { let salt = [0x55; 32]; let actual = pool .spawn_job(ep, Pbkdf2::start(b"password", &salt, 4096)) - .await - .unwrap(); + .await; let expected = [ 10, 114, 73, 188, 140, 222, 196, 156, 214, 184, 79, 157, 119, 242, 16, 31, 53, 242, diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index aa8c7ba319..fd8599bcb3 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -33,7 +33,8 @@ use uuid::{NoContext, Timestamp}; const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; -const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); +const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); +const HTTP_REPORTING_RETRY_DURATION: Duration = Duration::from_secs(60); /// Key that uniquely identifies the object, this metric describes. /// Currently, endpoint_id is enough, but this may change later, @@ -223,7 +224,10 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result anyhow::Result<()> { } /// Subscribe and fetch all the interesting data from the broker. -#[instrument(name = "broker pull", skip_all)] +#[instrument(name = "broker_pull", skip_all)] async fn pull_loop(conf: SafeKeeperConf, stats: Arc) -> Result<()> { let mut client = storage_broker::connect(conf.broker_endpoint, conf.broker_keepalive_interval)?; diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index c551cd3122..8b252b4ab4 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -7,6 +7,7 @@ use tokio::fs::File; use tokio::io::AsyncWriteExt; use utils::crashsafe::durable_rename; +use std::future::Future; use std::io::Read; use std::ops::Deref; use std::path::Path; @@ -31,10 +32,9 @@ pub const CHECKSUM_SIZE: usize = size_of::(); /// Storage should keep actual state inside of it. It should implement Deref /// trait to access state fields and have persist method for updating that state. -#[async_trait::async_trait] pub trait Storage: Deref { /// Persist safekeeper state on disk and update internal state. - async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()>; + fn persist(&mut self, s: &TimelinePersistentState) -> impl Future> + Send; /// Timestamp of last persist. fn last_persist_at(&self) -> Instant; @@ -188,7 +188,6 @@ impl TimelinePersistentState { } } -#[async_trait::async_trait] impl Storage for FileStorage { /// Persists state durably to the underlying storage. /// diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 91ffa95c21..9b7424a818 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -389,6 +389,25 @@ async fn timeline_digest_handler(request: Request) -> Result) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let response = tli + .backup_partial_reset() + .await + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, response) +} + /// Used only in tests to hand craft required data. async fn record_safekeeper_info(mut request: Request) -> Result, ApiError> { let ttid = TenantTimelineId::new( @@ -607,6 +626,10 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| { request_span(r, timeline_digest_handler) }) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/backup_partial_reset", + |r| request_span(r, timeline_backup_partial_reset), + ) .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) }) diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 1eacec9981..600a6bd8f0 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -183,10 +183,10 @@ impl WalResidentTimeline { "Replacing uploaded partial segment in in-mem control file: {replace:?}" ); - let remote_timeline_path = wal_backup::remote_timeline_path(&self.tli.ttid)?; + let remote_timeline_path = &self.tli.remote_path; wal_backup::copy_partial_segment( - &replace.previous.remote_path(&remote_timeline_path), - &replace.current.remote_path(&remote_timeline_path), + &replace.previous.remote_path(remote_timeline_path), + &replace.current.remote_path(remote_timeline_path), ) .await?; } diff --git a/safekeeper/src/recovery.rs b/safekeeper/src/recovery.rs index a59ff07b96..9c4149d8f1 100644 --- a/safekeeper/src/recovery.rs +++ b/safekeeper/src/recovery.rs @@ -35,7 +35,7 @@ use crate::{ /// Entrypoint for per timeline task which always runs, checking whether /// recovery for this safekeeper is needed and starting it if so. -#[instrument(name = "recovery task", skip_all, fields(ttid = %tli.ttid))] +#[instrument(name = "recovery", skip_all, fields(ttid = %tli.ttid))] pub async fn recovery_main(tli: WalResidentTimeline, conf: SafeKeeperConf) { info!("started"); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 0814d9ba67..dbe0034de2 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -875,6 +875,29 @@ where return Ok(Some(AcceptorProposerMessage::AppendResponse(resp))); } + // Disallow any non-sequential writes, which can result in gaps or + // overwrites. If we need to move the pointer, ProposerElected message + // should have truncated WAL first accordingly. Note that the first + // condition (WAL rewrite) is quite expected in real world; it happens + // when walproposer reconnects to safekeeper and writes some more data + // while first connection still gets some packets later. It might be + // better to not log this as error! above. + let write_lsn = self.wal_store.write_lsn(); + if write_lsn > msg.h.begin_lsn { + bail!( + "append request rewrites WAL written before, write_lsn={}, msg lsn={}", + write_lsn, + msg.h.begin_lsn + ); + } + if write_lsn < msg.h.begin_lsn && write_lsn != Lsn(0) { + bail!( + "append request creates gap in written WAL, write_lsn={}, msg lsn={}", + write_lsn, + msg.h.begin_lsn, + ); + } + // Now we know that we are in the same term as the proposer, // processing the message. @@ -960,10 +983,7 @@ mod tests { use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; use super::*; - use crate::{ - state::{EvictionState, PersistedPeers, TimelinePersistentState}, - wal_storage::Storage, - }; + use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState}; use std::{ops::Deref, str::FromStr, time::Instant}; // fake storage for tests @@ -971,7 +991,6 @@ mod tests { persisted_state: TimelinePersistentState, } - #[async_trait::async_trait] impl control_file::Storage for InMemoryState { async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { self.persisted_state = s.clone(); @@ -1003,8 +1022,11 @@ mod tests { lsn: Lsn, } - #[async_trait::async_trait] impl wal_storage::Storage for DummyWalStore { + fn write_lsn(&self) -> Lsn { + self.lsn + } + fn flush_lsn(&self) -> Lsn { self.lsn } @@ -1078,7 +1100,7 @@ mod tests { let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); let mut ar_hdr = AppendRequestHeader { - term: 1, + term: 2, term_start_lsn: Lsn(3), begin_lsn: Lsn(1), end_lsn: Lsn(2), @@ -1092,24 +1114,29 @@ mod tests { }; let pem = ProposerElected { - term: 1, - start_streaming_at: Lsn(3), - term_history: TermHistory(vec![TermLsn { - term: 1, - lsn: Lsn(3), - }]), - timeline_start_lsn: Lsn(0), + term: 2, + start_streaming_at: Lsn(1), + term_history: TermHistory(vec![ + TermLsn { + term: 1, + lsn: Lsn(1), + }, + TermLsn { + term: 2, + lsn: Lsn(3), + }, + ]), + timeline_start_lsn: Lsn(1), }; sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) .await .unwrap(); // check that AppendRequest before term_start_lsn doesn't switch last_log_term. - let resp = sk - .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) - .await; - assert!(resp.is_ok()); - assert_eq!(sk.get_last_log_term(), 0); + sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await + .unwrap(); + assert_eq!(sk.get_last_log_term(), 1); // but record at term_start_lsn does the switch ar_hdr.begin_lsn = Lsn(2); @@ -1118,12 +1145,63 @@ mod tests { h: ar_hdr, wal_data: Bytes::from_static(b"b"), }; - let resp = sk - .process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) - .await; - assert!(resp.is_ok()); - sk.wal_store.truncate_wal(Lsn(3)).await.unwrap(); // imitate the complete record at 3 %) - assert_eq!(sk.get_last_log_term(), 1); + sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await + .unwrap(); + assert_eq!(sk.get_last_log_term(), 2); + } + + #[tokio::test] + async fn test_non_consecutive_write() { + let storage = InMemoryState { + persisted_state: test_sk_state(), + }; + let wal_store = DummyWalStore { lsn: Lsn(0) }; + + let mut sk = SafeKeeper::new(TimelineState::new(storage), wal_store, NodeId(0)).unwrap(); + + let pem = ProposerElected { + term: 1, + start_streaming_at: Lsn(1), + term_history: TermHistory(vec![TermLsn { + term: 1, + lsn: Lsn(1), + }]), + timeline_start_lsn: Lsn(1), + }; + sk.process_msg(&ProposerAcceptorMessage::Elected(pem)) + .await + .unwrap(); + + let ar_hdr = AppendRequestHeader { + term: 1, + term_start_lsn: Lsn(3), + begin_lsn: Lsn(1), + end_lsn: Lsn(2), + commit_lsn: Lsn(0), + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }; + let append_request = AppendRequest { + h: ar_hdr.clone(), + wal_data: Bytes::from_static(b"b"), + }; + + // do write ending at 2, it should be ok + sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await + .unwrap(); + let mut ar_hrd2 = ar_hdr.clone(); + ar_hrd2.begin_lsn = Lsn(4); + ar_hrd2.end_lsn = Lsn(5); + let append_request = AppendRequest { + h: ar_hdr, + wal_data: Bytes::from_static(b"b"), + }; + // and now starting at 4, it must fail + sk.process_msg(&ProposerAcceptorMessage::AppendRequest(append_request)) + .await + .unwrap_err(); } #[test] diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 57935d879f..95ee925e1a 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -3,6 +3,7 @@ use anyhow::{anyhow, bail, Result}; use camino::Utf8PathBuf; +use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; @@ -36,7 +37,7 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup::{self}; +use crate::wal_backup::{self, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; @@ -469,6 +470,7 @@ impl From for ApiError { /// It also holds SharedState and provides mutually exclusive access to it. pub struct Timeline { pub ttid: TenantTimelineId, + pub remote_path: RemotePath, /// Used to broadcast commit_lsn updates to all background jobs. commit_lsn_watch_tx: watch::Sender, @@ -519,8 +521,10 @@ impl Timeline { let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); let walreceivers = WalReceivers::new(); + let remote_path = remote_timeline_path(&ttid)?; Ok(Timeline { ttid, + remote_path, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, @@ -557,8 +561,10 @@ impl Timeline { TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); let walreceivers = WalReceivers::new(); + let remote_path = remote_timeline_path(&ttid)?; Ok(Timeline { ttid, + remote_path, commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, @@ -902,6 +908,10 @@ impl Timeline { Ok(WalResidentTimeline::new(self.clone(), guard)) } + + pub async fn backup_partial_reset(self: &Arc) -> Result> { + self.manager_ctl.backup_partial_reset().await + } } /// This is a guard that allows to read/write disk timeline state. diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index ae6f3f4b7e..5d0567575c 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -28,28 +28,38 @@ impl Manager { /// - control file is flushed (no next event scheduled) /// - no WAL residence guards /// - no pushes to the broker - /// - partial WAL backup is uploaded + /// - last partial WAL segment is uploaded + /// - all local segments before the uploaded partial are committed and uploaded pub(crate) fn ready_for_eviction( &self, next_event: &Option, state: &StateSnapshot, ) -> bool { - self.backup_task.is_none() + let ready = self.backup_task.is_none() && self.recovery_task.is_none() && self.wal_removal_task.is_none() && self.partial_backup_task.is_none() - && self.partial_backup_uploaded.is_some() && next_event.is_none() && self.access_service.is_empty() && !self.tli_broker_active.get() + // Partial segment of current flush_lsn is uploaded up to this flush_lsn. && !wal_backup_partial::needs_uploading(state, &self.partial_backup_uploaded) + // And it is the next one after the last removed. Given that local + // WAL is removed only after it is uploaded to s3 (and pageserver + // advancing remote_consistent_lsn) which happens only after WAL is + // committed, true means all this is done. + // + // This also works for the first segment despite last_removed_segno + // being 0 on init because this 0 triggers run of wal_removal_task + // on success of which manager updates the horizon. && self .partial_backup_uploaded .as_ref() .unwrap() .flush_lsn .segment_number(self.wal_seg_size) - == self.last_removed_segno + 1 + == self.last_removed_segno + 1; + ready } /// Evict the timeline to remote storage. @@ -83,7 +93,8 @@ impl Manager { info!("successfully evicted timeline"); } - /// Restore evicted timeline from remote storage. + /// Attempt to restore evicted timeline from remote storage; it must be + /// offloaded. #[instrument(name = "unevict_timeline", skip_all)] pub(crate) async fn unevict_timeline(&mut self) { assert!(self.is_offloaded); @@ -167,7 +178,7 @@ async fn redownload_partial_segment( partial: &PartialRemoteSegment, ) -> anyhow::Result<()> { let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp"); - let remote_segfile = remote_segment_path(mgr, partial)?; + let remote_segfile = remote_segment_path(mgr, partial); debug!( "redownloading partial segment: {} -> {}", @@ -252,7 +263,7 @@ async fn do_validation( ); } - let remote_segfile = remote_segment_path(mgr, partial)?; + let remote_segfile = remote_segment_path(mgr, partial); let mut remote_reader: std::pin::Pin> = wal_backup::read_object(&remote_segfile, 0).await?; @@ -279,12 +290,8 @@ fn local_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> Utf8Path local_partial_segfile } -fn remote_segment_path( - mgr: &Manager, - partial: &PartialRemoteSegment, -) -> anyhow::Result { - let remote_timeline_path = wal_backup::remote_timeline_path(&mgr.tli.ttid)?; - Ok(partial.remote_path(&remote_timeline_path)) +fn remote_segment_path(mgr: &Manager, partial: &PartialRemoteSegment) -> RemotePath { + partial.remote_path(&mgr.tli.remote_path) } /// Compare first `n` bytes of two readers. If the bytes differ, return an error. diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 482614fac7..f997f48454 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -11,12 +11,14 @@ use std::{ time::Duration, }; +use futures::channel::oneshot; use postgres_ffi::XLogSegNo; use serde::{Deserialize, Serialize}; use tokio::{ task::{JoinError, JoinHandle}, time::Instant, }; +use tokio_util::sync::CancellationToken; use tracing::{debug, info, info_span, instrument, warn, Instrument}; use utils::lsn::Lsn; @@ -33,7 +35,7 @@ use crate::{ timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialRemoteSegment}, + wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}, SafeKeeperConf, }; @@ -96,6 +98,8 @@ pub enum ManagerCtlMessage { GuardRequest(tokio::sync::oneshot::Sender>), /// Request to drop the guard. GuardDrop(GuardId), + /// Request to reset uploaded partial backup state. + BackupPartialReset(oneshot::Sender>>), } impl std::fmt::Debug for ManagerCtlMessage { @@ -103,6 +107,7 @@ impl std::fmt::Debug for ManagerCtlMessage { match self { ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"), ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id), + ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"), } } } @@ -143,6 +148,19 @@ impl ManagerCtl { .and_then(std::convert::identity) } + /// Request timeline manager to reset uploaded partial segment state and + /// wait for the result. + pub async fn backup_partial_reset(&self) -> anyhow::Result> { + let (tx, rx) = oneshot::channel(); + self.manager_tx + .send(ManagerCtlMessage::BackupPartialReset(tx)) + .expect("manager task is not running"); + match rx.await { + Ok(res) => res, + Err(_) => anyhow::bail!("timeline manager is gone"), + } + } + /// Must be called exactly once to bootstrap the manager. pub fn bootstrap_manager( &self, @@ -181,7 +199,8 @@ pub(crate) struct Manager { pub(crate) wal_removal_task: Option>>, // partial backup - pub(crate) partial_backup_task: Option>>, + pub(crate) partial_backup_task: + Option<(JoinHandle>, CancellationToken)>, pub(crate) partial_backup_uploaded: Option, // misc @@ -302,12 +321,12 @@ pub async fn main_task( _ = sleep_until(&next_event) => { // we were waiting for some event (e.g. cfile save) } - res = await_task_finish(&mut mgr.wal_removal_task) => { + res = await_task_finish(mgr.wal_removal_task.as_mut()) => { // WAL removal task finished mgr.wal_removal_task = None; mgr.update_wal_removal_end(res); } - res = await_task_finish(&mut mgr.partial_backup_task) => { + res = await_task_finish(mgr.partial_backup_task.as_mut().map(|(handle, _)| handle)) => { // partial backup task finished mgr.partial_backup_task = None; mgr.update_partial_backup_end(res); @@ -335,8 +354,9 @@ pub async fn main_task( } } - if let Some(partial_backup_task) = &mut mgr.partial_backup_task { - if let Err(e) = partial_backup_task.await { + if let Some((handle, cancel)) = &mut mgr.partial_backup_task { + cancel.cancel(); + if let Err(e) = handle.await { warn!("partial backup task failed: {:?}", e); } } @@ -560,11 +580,14 @@ impl Manager { } // Get WalResidentTimeline and start partial backup task. - self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( + let cancel = CancellationToken::new(); + let handle = tokio::spawn(wal_backup_partial::main_task( self.wal_resident_timeline(), self.conf.clone(), self.global_rate_limiter.clone(), - ))); + cancel.clone(), + )); + self.partial_backup_task = Some((handle, cancel)); } /// Update the state after partial WAL backup task finished. @@ -579,6 +602,39 @@ impl Manager { } } + /// Reset partial backup state and remove its remote storage data. Since it + /// might concurrently uploading something, cancel the task first. + async fn backup_partial_reset(&mut self) -> anyhow::Result> { + info!("resetting partial backup state"); + // Force unevict timeline if it is evicted before erasing partial backup + // state. The intended use of this function is to drop corrupted remote + // state; we haven't enabled local files deletion yet anywhere, + // so direct switch is safe. + if self.is_offloaded { + self.tli.switch_to_present().await?; + // switch manager state as soon as possible + self.is_offloaded = false; + } + + if let Some((handle, cancel)) = &mut self.partial_backup_task { + cancel.cancel(); + info!("cancelled partial backup task, awaiting it"); + // we're going to reset .partial_backup_uploaded to None anyway, so ignore the result + handle.await.ok(); + self.partial_backup_task = None; + } + + let tli = self.wal_resident_timeline(); + let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await; + // Reset might fail e.g. when cfile is already reset but s3 removal + // failed, so set manager state to None beforehand. In any case caller + // is expected to retry until success. + self.partial_backup_uploaded = None; + let res = partial_backup.reset().await?; + info!("reset is done"); + Ok(res) + } + /// Handle message arrived from ManagerCtl. async fn handle_message(&mut self, msg: Option) { debug!("received manager message: {:?}", msg); @@ -602,6 +658,16 @@ impl Manager { Some(ManagerCtlMessage::GuardDrop(guard_id)) => { self.access_service.drop_guard(guard_id); } + Some(ManagerCtlMessage::BackupPartialReset(tx)) => { + info!("resetting uploaded partial backup state"); + let res = self.backup_partial_reset().await; + if let Err(ref e) = res { + warn!("failed to reset partial backup state: {:?}", e); + } + if tx.send(res).is_err() { + warn!("failed to send partial backup reset result, receiver dropped"); + } + } None => { // can't happen, we're holding the sender unreachable!(); @@ -619,7 +685,11 @@ async fn sleep_until(option: &Option) { } } -async fn await_task_finish(option: &mut Option>) -> Result { +/// Future that resolves when the task is finished or never if the task is None. +/// +/// Note: it accepts Option<&mut> instead of &mut Option<> because mapping the +/// option to get the latter is hard. +async fn await_task_finish(option: Option<&mut JoinHandle>) -> Result { if let Some(task) = option { task.await } else { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index aa1a6696a1..ef26ac99c5 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -203,7 +203,7 @@ struct WalBackupTask { } /// Offload single timeline. -#[instrument(name = "WAL backup", skip_all, fields(ttid = %tli.ttid))] +#[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))] async fn backup_task_main( tli: WalResidentTimeline, parallel_jobs: usize, @@ -315,7 +315,7 @@ async fn backup_lsn_range( anyhow::bail!("parallel_jobs must be >= 1"); } - let remote_timeline_path = remote_timeline_path(&timeline.ttid)?; + let remote_timeline_path = &timeline.remote_path; let start_lsn = *backup_lsn; let segments = get_segments(start_lsn, end_lsn, wal_seg_size); @@ -328,11 +328,7 @@ async fn backup_lsn_range( loop { let added_task = match iter.next() { Some(s) => { - uploads.push_back(backup_single_segment( - s, - timeline_dir, - &remote_timeline_path, - )); + uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path)); true } None => false, diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 675a051887..4050a82fff 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -22,6 +22,7 @@ use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn}; use utils::{id::NodeId, lsn::Lsn}; @@ -31,7 +32,7 @@ use crate::{ safekeeper::Term, timeline::WalResidentTimeline, timeline_manager::StateSnapshot, - wal_backup::{self, remote_timeline_path}, + wal_backup::{self}, SafeKeeperConf, }; @@ -145,7 +146,7 @@ impl State { } } -struct PartialBackup { +pub struct PartialBackup { wal_seg_size: usize, tli: WalResidentTimeline, conf: SafeKeeperConf, @@ -155,8 +156,25 @@ struct PartialBackup { state: State, } -// Read-only methods for getting segment names impl PartialBackup { + pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup { + let (_, persistent_state) = tli.get_state().await; + let wal_seg_size = tli.get_wal_seg_size().await; + + let local_prefix = tli.get_timeline_dir(); + let remote_timeline_path = tli.remote_path.clone(); + + PartialBackup { + wal_seg_size, + tli, + state: persistent_state.partial_backup, + conf, + local_prefix, + remote_timeline_path, + } + } + + // Read-only methods for getting segment names fn segno(&self, lsn: Lsn) -> XLogSegNo { lsn.segment_number(self.wal_seg_size) } @@ -297,6 +315,18 @@ impl PartialBackup { Ok(()) } + // Prepend to the given segments remote prefix and delete them from the + // remote storage. + async fn delete_segments(&self, segments_to_delete: &Vec) -> anyhow::Result<()> { + info!("deleting objects: {:?}", segments_to_delete); + let mut objects_to_delete = vec![]; + for seg in segments_to_delete.iter() { + let remote_path = self.remote_timeline_path.join(seg); + objects_to_delete.push(remote_path); + } + wal_backup::delete_objects(&objects_to_delete).await + } + /// Delete all non-Uploaded segments from the remote storage. There should be only one /// Uploaded segment at a time. #[instrument(name = "gc", skip_all)] @@ -329,15 +359,8 @@ impl PartialBackup { ); } - info!("deleting objects: {:?}", segments_to_delete); - let mut objects_to_delete = vec![]; - for seg in segments_to_delete.iter() { - let remote_path = self.remote_timeline_path.join(seg); - objects_to_delete.push(remote_path); - } - - // removing segments from remote storage - wal_backup::delete_objects(&objects_to_delete).await?; + // execute the deletion + self.delete_segments(&segments_to_delete).await?; // now we can update the state on disk let new_state = { @@ -349,6 +372,27 @@ impl PartialBackup { Ok(()) } + + /// Remove uploaded segment(s) from the state and remote storage. Aimed for + /// manual intervention, not normally needed. + /// Returns list of segments which potentially existed in the remote storage. + pub async fn reset(&mut self) -> anyhow::Result> { + let segments_to_delete = self + .state + .segments + .iter() + .map(|seg| seg.name.clone()) + .collect(); + + // First reset cfile state, and only then objects themselves. If the + // later fails we might leave some garbage behind; that's ok for this + // single time usage. + let new_state = State { segments: vec![] }; + self.commit_state(new_state).await?; + + self.delete_segments(&segments_to_delete).await?; + Ok(segments_to_delete) + } } /// Check if everything is uploaded and partial backup task doesn't need to run. @@ -372,38 +416,21 @@ pub(crate) fn needs_uploading( /// /// When there is nothing more to do and the last segment was successfully uploaded, the task /// returns PartialRemoteSegment, to signal readiness for offloading the timeline. -#[instrument(name = "Partial backup", skip_all, fields(ttid = %tli.ttid))] +#[instrument(name = "partial_backup", skip_all, fields(ttid = %tli.ttid))] pub async fn main_task( tli: WalResidentTimeline, conf: SafeKeeperConf, limiter: RateLimiter, + cancel: CancellationToken, ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; let mut first_iteration = true; - let (_, persistent_state) = tli.get_state().await; let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); - let wal_seg_size = tli.get_wal_seg_size().await; - let local_prefix = tli.get_timeline_dir(); - let remote_timeline_path = match remote_timeline_path(&tli.ttid) { - Ok(path) => path, - Err(e) => { - error!("failed to create remote path: {:?}", e); - return None; - } - }; - - let mut backup = PartialBackup { - wal_seg_size, - tli, - state: persistent_state.partial_backup, - conf, - local_prefix, - remote_timeline_path, - }; + let mut backup = PartialBackup::new(tli, conf).await; debug!("state: {:?}", backup.state); @@ -433,6 +460,10 @@ pub async fn main_task( && flush_lsn_rx.borrow().term == seg.term { // we have nothing to do, the last segment is already uploaded + debug!( + "exiting, uploaded up to term={} flush_lsn={} commit_lsn={}", + seg.term, seg.flush_lsn, seg.commit_lsn + ); return Some(seg.clone()); } } @@ -444,6 +475,10 @@ pub async fn main_task( info!("timeline canceled"); return None; } + _ = cancel.cancelled() => { + info!("task canceled"); + return None; + } _ = flush_lsn_rx.changed() => {} } } @@ -470,6 +505,10 @@ pub async fn main_task( info!("timeline canceled"); return None; } + _ = cancel.cancelled() => { + info!("task canceled"); + return None; + } _ = commit_lsn_rx.changed() => {} _ = flush_lsn_rx.changed() => { let segno = backup.segno(flush_lsn_rx.borrow().lsn); @@ -492,7 +531,13 @@ pub async fn main_task( } // limit concurrent uploads - let _upload_permit = limiter.acquire_partial_backup().await; + let _upload_permit = tokio::select! { + acq = limiter.acquire_partial_backup() => acq, + _ = cancel.cancelled() => { + info!("task canceled"); + return None; + } + }; let prepared = backup.prepare_upload().await; if let Some(seg) = &uploaded_segment { diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index ded8571a3e..89c2e98a94 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -15,6 +15,7 @@ use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogF use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; use std::cmp::{max, min}; +use std::future::Future; use std::io::{self, SeekFrom}; use std::pin::Pin; use tokio::fs::{self, remove_file, File, OpenOptions}; @@ -35,8 +36,9 @@ use postgres_ffi::XLOG_BLCKSZ; use pq_proto::SystemId; use utils::{id::TenantTimelineId, lsn::Lsn}; -#[async_trait::async_trait] pub trait Storage { + // Last written LSN. + fn write_lsn(&self) -> Lsn; /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn; @@ -44,16 +46,19 @@ pub trait Storage { /// the segment and short header at the page of given LSN. This is only used /// for timeline initialization because compute will stream data only since /// init_lsn. Other segment headers are included in compute stream. - async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()>; + fn initialize_first_segment( + &mut self, + init_lsn: Lsn, + ) -> impl Future> + Send; /// Write piece of WAL from buf to disk, but not necessarily sync it. - async fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> Result<()>; + fn write_wal(&mut self, startpos: Lsn, buf: &[u8]) -> impl Future> + Send; /// Truncate WAL at specified LSN, which must be the end of WAL record. - async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()>; + fn truncate_wal(&mut self, end_pos: Lsn) -> impl Future> + Send; /// Durably store WAL on disk, up to the last written WAL record. - async fn flush_wal(&mut self) -> Result<()>; + fn flush_wal(&mut self) -> impl Future> + Send; /// Remove all segments <= given segno. Returns function doing that as we /// want to perform it without timeline lock. @@ -325,8 +330,11 @@ impl PhysicalStorage { } } -#[async_trait::async_trait] impl Storage for PhysicalStorage { + // Last written LSN. + fn write_lsn(&self) -> Lsn { + self.write_lsn + } /// flush_lsn returns LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { self.flush_record_lsn diff --git a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs index c2db9de78a..b854754ecf 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper_disk.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper_disk.rs @@ -83,7 +83,6 @@ impl DiskStateStorage { } } -#[async_trait::async_trait] impl control_file::Storage for DiskStateStorage { /// Persist safekeeper state on disk and update internal state. async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { @@ -175,8 +174,11 @@ impl DiskWALStorage { } } -#[async_trait::async_trait] impl wal_storage::Storage for DiskWALStorage { + // Last written LSN. + fn write_lsn(&self) -> Lsn { + self.write_lsn + } /// LSN of last durably stored WAL record. fn flush_lsn(&self) -> Lsn { self.flush_record_lsn diff --git a/storage_controller/client/Cargo.toml b/storage_controller/client/Cargo.toml index c3bfe2bfd2..e7a4264fd0 100644 --- a/storage_controller/client/Cargo.toml +++ b/storage_controller/client/Cargo.toml @@ -8,7 +8,6 @@ license.workspace = true pageserver_api.workspace = true pageserver_client.workspace = true thiserror.workspace = true -async-trait.workspace = true reqwest.workspace = true utils.workspace = true serde.workspace = true diff --git a/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql b/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql new file mode 100644 index 0000000000..9dfc750586 --- /dev/null +++ b/storage_controller/migrations/2024-08-23-102952_safekeepers/down.sql @@ -0,0 +1,2 @@ +-- This file should undo anything in `up.sql` +DROP TABLE safekeepers; diff --git a/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql b/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql new file mode 100644 index 0000000000..c78716660f --- /dev/null +++ b/storage_controller/migrations/2024-08-23-102952_safekeepers/up.sql @@ -0,0 +1,15 @@ +-- started out as a copy of cplane schema, removed the unnecessary columns. +CREATE TABLE safekeepers ( + -- the surrogate identifier defined by control plane database sequence + id BIGINT PRIMARY KEY, + region_id TEXT NOT NULL, + version BIGINT NOT NULL, + -- the natural id on whatever cloud platform, not needed in storage controller + -- instance_id TEXT UNIQUE NOT NULL, + host TEXT NOT NULL, + port INTEGER NOT NULL, + active BOOLEAN NOT NULL DEFAULT false, + -- projects_count INTEGER NOT NULL DEFAULT 0, + http_port INTEGER NOT NULL, + availability_zone_id TEXT NOT NULL +); diff --git a/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql b/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql new file mode 100644 index 0000000000..22df81c83c --- /dev/null +++ b/storage_controller/migrations/2024-08-27-184400_pageserver_az/down.sql @@ -0,0 +1 @@ +ALTER TABLE nodes DROP availability_zone_id; diff --git a/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql b/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql new file mode 100644 index 0000000000..7112f92bf2 --- /dev/null +++ b/storage_controller/migrations/2024-08-27-184400_pageserver_az/up.sql @@ -0,0 +1 @@ +ALTER TABLE nodes ADD availability_zone_id VARCHAR; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 207bd5a1e6..32882c201a 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -2,6 +2,7 @@ use crate::metrics::{ HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup, METRICS_REGISTRY, }; +use crate::persistence::SafekeeperPersistence; use crate::reconciler::ReconcileError; use crate::service::{LeadershipStatus, Service, STARTUP_RECONCILE_TIMEOUT}; use anyhow::Context; @@ -17,7 +18,7 @@ use pageserver_api::controller_api::{ }; use pageserver_api::models::{ TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, - TenantTimeTravelRequest, TimelineCreateRequest, + TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; @@ -101,7 +102,7 @@ async fn handle_validate(mut req: Request) -> Result, ApiEr let validate_req = json_request::(&mut req).await?; let state = get_state(&req); - json_response(StatusCode::OK, state.service.validate(validate_req)) + json_response(StatusCode::OK, state.service.validate(validate_req).await?) } /// Call into this before attaching a tenant to a pageserver, to acquire a generation number @@ -334,6 +335,24 @@ async fn handle_tenant_timeline_delete( .await } +async fn handle_tenant_timeline_archival_config( + service: Arc, + mut req: Request, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?; + check_permissions(&req, Scope::PageServerApi)?; + + let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?; + + let create_req = json_request::(&mut req).await?; + + service + .tenant_timeline_archival_config(tenant_id, timeline_id, create_req) + .await?; + + json_response(StatusCode::OK, ()) +} + async fn handle_tenant_timeline_detach_ancestor( service: Arc, req: Request, @@ -749,6 +768,55 @@ impl From for ApiError { } } +/// Return the safekeeper record by instance id, or 404. +/// +/// Not used by anything except manual testing. +async fn handle_get_safekeeper(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let id = parse_request_param::(&req, "id")?; + + let state = get_state(&req); + + let res = state.service.get_safekeeper(id).await; + + match res { + Ok(b) => json_response(StatusCode::OK, b), + Err(crate::persistence::DatabaseError::Query(diesel::result::Error::NotFound)) => { + Err(ApiError::NotFound("unknown instance_id".into())) + } + Err(other) => Err(other.into()), + } +} + +/// Used as part of deployment scripts. +/// +/// Assumes information is only relayed to storage controller after first selecting an unique id on +/// control plane database, which means we have an id field in the request and payload. +async fn handle_upsert_safekeeper(mut req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let body = json_request::(&mut req).await?; + let id = parse_request_param::(&req, "id")?; + + if id != body.id { + // it should be repeated + return Err(ApiError::BadRequest(anyhow::anyhow!( + "id mismatch: url={id:?}, body={:?}", + body.id + ))); + } + + let state = get_state(&req); + + state.service.upsert_safekeeper(body).await?; + + Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty()) + .unwrap()) +} + /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only /// be allowed to run if Service has finished its initial reconciliation. async fn tenant_service_handler( @@ -1109,6 +1177,13 @@ pub fn make_router( .put("/control/v1/step_down", |r| { named_request_span(r, handle_step_down, RequestName("control_v1_step_down")) }) + .get("/control/v1/safekeeper/:id", |r| { + named_request_span(r, handle_get_safekeeper, RequestName("v1_safekeeper")) + }) + .post("/control/v1/safekeeper/:id", |r| { + // id is in the body + named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper")) + }) // Tenant operations // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity. @@ -1160,6 +1235,16 @@ pub fn make_router( RequestName("v1_tenant_timeline"), ) }) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/archival_config", + |r| { + tenant_service_handler( + r, + handle_tenant_timeline_archival_config, + RequestName("v1_tenant_timeline_archival_config"), + ) + }, + ) .put( "/v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor", |r| { diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index e3f29b84e7..00e90f4467 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -11,8 +11,8 @@ use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, - RECONCILER_CONCURRENCY_DEFAULT, + Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, + MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -104,6 +104,10 @@ struct Cli { // a pageserver #[arg(long)] max_secondary_lag_bytes: Option, + + // Period with which to send heartbeats to registered nodes + #[arg(long)] + heartbeat_interval: Option, } enum StrictMode { @@ -285,6 +289,10 @@ async fn async_main() -> anyhow::Result<()> { split_threshold: args.split_threshold, neon_local_repo_dir: args.neon_local_repo_dir, max_secondary_lag_bytes: args.max_secondary_lag_bytes, + heartbeat_interval: args + .heartbeat_interval + .map(humantime::Duration::into) + .unwrap_or(HEARTBEAT_INTERVAL_DEFAULT), address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 61a44daca9..73cecc491d 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -36,6 +36,8 @@ pub(crate) struct Node { listen_pg_addr: String, listen_pg_port: u16, + availability_zone_id: Option, + // This cancellation token means "stop any RPCs in flight to this node, and don't start // any more". It is not related to process shutdown. #[serde(skip)] @@ -61,6 +63,10 @@ impl Node { self.id } + pub(crate) fn get_availability_zone_id(&self) -> Option<&str> { + self.availability_zone_id.as_deref() + } + pub(crate) fn get_scheduling(&self) -> NodeSchedulingPolicy { self.scheduling } @@ -72,7 +78,18 @@ impl Node { /// Does this registration request match `self`? This is used when deciding whether a registration /// request should be allowed to update an existing record with the same node ID. pub(crate) fn registration_match(&self, register_req: &NodeRegisterRequest) -> bool { - self.id == register_req.node_id + let az_ids_match = { + match ( + self.availability_zone_id.as_deref(), + register_req.availability_zone_id.as_deref(), + ) { + (Some(current_az), Some(register_req_az)) => current_az == register_req_az, + _ => true, + } + }; + + az_ids_match + && self.id == register_req.node_id && self.listen_http_addr == register_req.listen_http_addr && self.listen_http_port == register_req.listen_http_port && self.listen_pg_addr == register_req.listen_pg_addr @@ -173,6 +190,7 @@ impl Node { listen_http_port: u16, listen_pg_addr: String, listen_pg_port: u16, + availability_zone_id: Option, ) -> Self { Self { id, @@ -182,6 +200,7 @@ impl Node { listen_pg_port, scheduling: NodeSchedulingPolicy::Active, availability: NodeAvailability::Offline, + availability_zone_id, cancel: CancellationToken::new(), } } @@ -194,6 +213,7 @@ impl Node { listen_http_port: self.listen_http_port as i32, listen_pg_addr: self.listen_pg_addr.clone(), listen_pg_port: self.listen_pg_port as i32, + availability_zone_id: self.availability_zone_id.clone(), } } @@ -208,6 +228,7 @@ impl Node { listen_http_port: np.listen_http_port as u16, listen_pg_addr: np.listen_pg_addr, listen_pg_port: np.listen_pg_port as u16, + availability_zone_id: np.availability_zone_id, cancel: CancellationToken::new(), } } diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 8d64201cd9..20770ed703 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -2,8 +2,8 @@ use pageserver_api::{ models::{ detach_ancestor::AncestorDetached, LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress, TenantScanRemoteStorageResponse, - TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, - TopTenantShardsRequest, TopTenantShardsResponse, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineArchivalConfigRequest, + TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse, }, shard::TenantShardId, }; @@ -227,6 +227,22 @@ impl PageserverClient { ) } + pub(crate) async fn timeline_archival_config( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + req: &TimelineArchivalConfigRequest, + ) -> Result<()> { + measured_request!( + "timeline_archival_config", + crate::metrics::Method::Post, + &self.node_id_label, + self.inner + .timeline_archival_config(tenant_shard_id, timeline_id, req) + .await + ) + } + pub(crate) async fn timeline_detach_ancestor( &self, tenant_shard_id: TenantShardId, diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 1a905753a1..e801289752 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -8,6 +8,7 @@ use self::split_state::SplitState; use diesel::pg::PgConnection; use diesel::prelude::*; use diesel::Connection; +use itertools::Itertools; use pageserver_api::controller_api::MetadataHealthRecord; use pageserver_api::controller_api::ShardSchedulingPolicy; use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy}; @@ -91,7 +92,8 @@ pub(crate) enum DatabaseOperation { Detach, ReAttach, IncrementGeneration, - PeekGenerations, + TenantGenerations, + ShardGenerations, ListTenantShards, InsertTenantShards, UpdateTenantShard, @@ -103,6 +105,7 @@ pub(crate) enum DatabaseOperation { ListMetadataHealthOutdated, GetLeader, UpdateLeader, + SetNodeAzId, } #[must_use] @@ -121,6 +124,13 @@ pub(crate) enum TenantFilter { Shard(TenantShardId), } +/// Represents the results of looking up generation+pageserver for the shards of a tenant +pub(crate) struct ShardGenerationState { + pub(crate) tenant_shard_id: TenantShardId, + pub(crate) generation: Option, + pub(crate) generation_pageserver: Option, +} + impl Persistence { // The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under // normal circumstances. This assumes we have exclusive use of the database cluster to which we connect. @@ -315,6 +325,31 @@ impl Persistence { } } + pub(crate) async fn set_node_availability_zone_id( + &self, + input_node_id: NodeId, + input_az_id: String, + ) -> DatabaseResult<()> { + use crate::schema::nodes::dsl::*; + let updated = self + .with_measured_conn(DatabaseOperation::SetNodeAzId, move |conn| { + let updated = diesel::update(nodes) + .filter(node_id.eq(input_node_id.0 as i64)) + .set((availability_zone_id.eq(input_az_id.clone()),)) + .execute(conn)?; + Ok(updated) + }) + .await?; + + if updated != 1 { + Err(DatabaseError::Logical(format!( + "Node {node_id:?} not found for setting az id", + ))) + } else { + Ok(()) + } + } + /// At startup, load the high level state for shards, such as their config + policy. This will /// be enriched at runtime with state discovered on pageservers. pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult> { @@ -511,13 +546,13 @@ impl Persistence { /// If the tenant doesn't exist, an empty vector is returned. /// /// Output is sorted by shard number - pub(crate) async fn peek_generations( + pub(crate) async fn tenant_generations( &self, filter_tenant_id: TenantId, - ) -> Result, Option)>, DatabaseError> { + ) -> Result, DatabaseError> { use crate::schema::tenant_shards::dsl::*; let rows = self - .with_measured_conn(DatabaseOperation::PeekGenerations, move |conn| { + .with_measured_conn(DatabaseOperation::TenantGenerations, move |conn| { let result = tenant_shards .filter(tenant_id.eq(filter_tenant_id.to_string())) .select(TenantShardPersistence::as_select()) @@ -529,12 +564,69 @@ impl Persistence { Ok(rows .into_iter() - .map(|p| { + .map(|p| ShardGenerationState { + tenant_shard_id: p + .get_tenant_shard_id() + .expect("Corrupt tenant shard id in database"), + generation: p.generation.map(|g| Generation::new(g as u32)), + generation_pageserver: p.generation_pageserver.map(|n| NodeId(n as u64)), + }) + .collect()) + } + + /// Read the generation number of specific tenant shards + /// + /// Output is unsorted. Output may not include values for all inputs, if they are missing in the database. + pub(crate) async fn shard_generations( + &self, + mut tenant_shard_ids: impl Iterator, + ) -> Result)>, DatabaseError> { + let mut rows = Vec::with_capacity(tenant_shard_ids.size_hint().0); + + // We will chunk our input to avoid composing arbitrarily long `IN` clauses. Typically we are + // called with a single digit number of IDs, but in principle we could be called with tens + // of thousands (all the shards on one pageserver) from the generation validation API. + loop { + // A modest hardcoded chunk size to handle typical cases in a single query but never generate particularly + // large query strings. + let chunk_ids = tenant_shard_ids.by_ref().take(32); + + // Compose a comma separated list of tuples for matching on (tenant_id, shard_number, shard_count) + let in_clause = chunk_ids + .map(|tsid| { + format!( + "('{}', {}, {})", + tsid.tenant_id, tsid.shard_number.0, tsid.shard_count.0 + ) + }) + .join(","); + + // We are done when our iterator gives us nothing to filter on + if in_clause.is_empty() { + break; + } + + let chunk_rows = self + .with_measured_conn(DatabaseOperation::ShardGenerations, move |conn| { + // diesel doesn't support multi-column IN queries, so we compose raw SQL. No escaping is required because + // the inputs are strongly typed and cannot carry any user-supplied raw string content. + let result : Vec = diesel::sql_query( + format!("SELECT * from tenant_shards where (tenant_id, shard_number, shard_count) in ({in_clause});").as_str() + ).load(conn)?; + + Ok(result) + }) + .await?; + rows.extend(chunk_rows.into_iter()) + } + + Ok(rows + .into_iter() + .map(|tsp| { ( - p.get_tenant_shard_id() - .expect("Corrupt tenant shard id in database"), - p.generation.map(|g| Generation::new(g as u32)), - p.generation_pageserver.map(|n| NodeId(n as u64)), + tsp.get_tenant_shard_id() + .expect("Bad tenant ID in database"), + tsp.generation.map(|g| Generation::new(g as u32)), ) }) .collect()) @@ -906,10 +998,54 @@ impl Persistence { Ok(()) } + + pub(crate) async fn safekeeper_get( + &self, + id: i64, + ) -> Result { + use crate::schema::safekeepers::dsl::{id as id_column, safekeepers}; + self.with_conn(move |conn| -> DatabaseResult { + Ok(safekeepers + .filter(id_column.eq(&id)) + .select(SafekeeperPersistence::as_select()) + .get_result(conn)?) + }) + .await + } + + pub(crate) async fn safekeeper_upsert( + &self, + record: SafekeeperPersistence, + ) -> Result<(), DatabaseError> { + use crate::schema::safekeepers::dsl::*; + + self.with_conn(move |conn| -> DatabaseResult<()> { + let bind = record.as_insert_or_update(); + + let inserted_updated = diesel::insert_into(safekeepers) + .values(&bind) + .on_conflict(id) + .do_update() + .set(&bind) + .execute(conn)?; + + if inserted_updated != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + inserted_updated + ))); + } + + Ok(()) + }) + .await + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably -#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)] +#[derive( + QueryableByName, Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq, +)] #[diesel(table_name = crate::schema::tenant_shards)] pub(crate) struct TenantShardPersistence { #[serde(default)] @@ -974,6 +1110,7 @@ pub(crate) struct NodePersistence { pub(crate) listen_http_port: i32, pub(crate) listen_pg_addr: String, pub(crate) listen_pg_port: i32, + pub(crate) availability_zone_id: Option, } /// Tenant metadata health status that are stored durably. @@ -1040,3 +1177,47 @@ pub(crate) struct ControllerPersistence { pub(crate) address: String, pub(crate) started_at: chrono::DateTime, } + +#[derive(Serialize, Deserialize, Queryable, Selectable, Eq, PartialEq, Debug, Clone)] +#[diesel(table_name = crate::schema::safekeepers)] +pub(crate) struct SafekeeperPersistence { + pub(crate) id: i64, + pub(crate) region_id: String, + /// 1 is special, it means just created (not currently posted to storcon). + /// Zero or negative is not really expected. + /// Otherwise the number from `release-$(number_of_commits_on_branch)` tag. + pub(crate) version: i64, + pub(crate) host: String, + pub(crate) port: i32, + pub(crate) active: bool, + pub(crate) http_port: i32, + pub(crate) availability_zone_id: String, +} + +impl SafekeeperPersistence { + fn as_insert_or_update(&self) -> InsertUpdateSafekeeper<'_> { + InsertUpdateSafekeeper { + id: self.id, + region_id: &self.region_id, + version: self.version, + host: &self.host, + port: self.port, + active: self.active, + http_port: self.http_port, + availability_zone_id: &self.availability_zone_id, + } + } +} + +#[derive(Insertable, AsChangeset)] +#[diesel(table_name = crate::schema::safekeepers)] +struct InsertUpdateSafekeeper<'a> { + id: i64, + region_id: &'a str, + version: i64, + host: &'a str, + port: i32, + active: bool, + http_port: i32, + availability_zone_id: &'a str, +} diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 94db879ade..83b7b2b4f2 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -12,10 +12,12 @@ use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, Instant}; use tokio_util::sync::CancellationToken; +use utils::backoff::exponential_backoff; use utils::failpoint_support; use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use utils::lsn::Lsn; +use utils::pausable_failpoint; use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; @@ -568,6 +570,7 @@ impl Reconciler { // During a live migration it is unhelpful to proceed if we couldn't notify compute: if we detach // the origin without notifying compute, we will render the tenant unavailable. + let mut notify_attempts = 0; while let Err(e) = self.compute_notify().await { match e { NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)), @@ -578,8 +581,21 @@ impl Reconciler { ); } } + + exponential_backoff( + notify_attempts, + // Generous waits: control plane operations which might be blocking us usually complete on the order + // of hundreds to thousands of milliseconds, so no point busy polling. + 1.0, + 10.0, + &self.cancel, + ) + .await; + notify_attempts += 1; } + pausable_failpoint!("reconciler-live-migrate-post-notify"); + // Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then // this location will be deleted in the general case reconciliation that runs after this. let origin_secondary_conf = build_location_config( diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 060e3cc6ca..ef4da6861c 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -528,6 +528,7 @@ pub(crate) mod test_utils { 80 + i as u16, format!("pghost-{i}"), 5432 + i as u16, + None, ); node.set_availability(NodeAvailability::Active(test_utilization::simple(0, 0))); assert!(node.is_available()); diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs index 77ba47e114..e0f515daea 100644 --- a/storage_controller/src/schema.rs +++ b/storage_controller/src/schema.rs @@ -25,6 +25,7 @@ diesel::table! { listen_http_port -> Int4, listen_pg_addr -> Varchar, listen_pg_port -> Int4, + availability_zone_id -> Nullable, } } @@ -44,3 +45,17 @@ diesel::table! { } diesel::allow_tables_to_appear_in_same_query!(controllers, metadata_health, nodes, tenant_shards,); + +diesel::table! { + safekeepers { + id -> Int8, + region_id -> Text, + version -> Int8, + instance_id -> Text, + host -> Text, + port -> Int4, + active -> Bool, + http_port -> Int4, + availability_zone_id -> Text, + } +} diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 7daa1e4f5f..ca416095bb 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -22,7 +22,7 @@ use crate::{ peer_client::GlobalObservedState, persistence::{ AbortShardSplitStatus, ControllerPersistence, DatabaseResult, MetadataHealthPersistence, - TenantFilter, + ShardGenerationState, TenantFilter, }, reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, scheduler::{MaySchedule, ScheduleContext, ScheduleMode}, @@ -46,7 +46,10 @@ use pageserver_api::{ TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, }, - models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, + models::{ + SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, + TopTenantShardsRequest, + }, }; use reqwest::StatusCode; use tracing::{instrument, Instrument}; @@ -118,6 +121,9 @@ pub const MAX_OFFLINE_INTERVAL_DEFAULT: Duration = Duration::from_secs(30); /// being handled on the pageserver side. pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); +/// How often to send heartbeats to registered nodes? +pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5); + #[derive(Clone, strum_macros::Display)] enum TenantOperations { Create, @@ -131,6 +137,7 @@ enum TenantOperations { TimelineCreate, TimelineDelete, AttachHook, + TimelineArchivalConfig, TimelineDetachAncestor, } @@ -322,6 +329,8 @@ pub struct Config { // upgraded to primary. pub max_secondary_lag_bytes: Option, + pub heartbeat_interval: Duration, + pub address_for_peers: Option, pub start_as_candidate: bool, @@ -905,9 +914,7 @@ impl Service { async fn spawn_heartbeat_driver(&self) { self.startup_complete.clone().wait().await; - const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(5); - - let mut interval = tokio::time::interval(HEARTBEAT_INTERVAL); + let mut interval = tokio::time::interval(self.config.heartbeat_interval); while !self.cancel.is_cancelled() { tokio::select! { _ = interval.tick() => { } @@ -1257,6 +1264,7 @@ impl Service { 123, "".to_string(), 123, + None, ); scheduler.node_upsert(&node); @@ -1846,37 +1854,74 @@ impl Service { Ok(response) } - pub(crate) fn validate(&self, validate_req: ValidateRequest) -> ValidateResponse { - let locked = self.inner.read().unwrap(); + pub(crate) async fn validate( + &self, + validate_req: ValidateRequest, + ) -> Result { + // Fast in-memory check: we may reject validation on anything that doesn't match our + // in-memory generation for a shard + let in_memory_result = { + let mut in_memory_result = Vec::new(); + let locked = self.inner.read().unwrap(); + for req_tenant in validate_req.tenants { + if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { + let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); + tracing::info!( + "handle_validate: {}(gen {}): valid={valid} (latest {:?})", + req_tenant.id, + req_tenant.gen, + tenant_shard.generation + ); + + in_memory_result.push((req_tenant.id, Generation::new(req_tenant.gen), valid)); + } else { + // This is legal: for example during a shard split the pageserver may still + // have deletions in its queue from the old pre-split shard, or after deletion + // of a tenant that was busy with compaction/gc while being deleted. + tracing::info!( + "Refusing deletion validation for missing shard {}", + req_tenant.id + ); + } + } + + in_memory_result + }; + + // Database calls to confirm validity for anything that passed the in-memory check. We must do this + // in case of controller split-brain, where some other controller process might have incremented the generation. + let db_generations = self + .persistence + .shard_generations(in_memory_result.iter().filter_map(|i| { + if i.2 { + Some(&i.0) + } else { + None + } + })) + .await?; + let db_generations = db_generations.into_iter().collect::>(); let mut response = ValidateResponse { tenants: Vec::new(), }; - - for req_tenant in validate_req.tenants { - if let Some(tenant_shard) = locked.tenants.get(&req_tenant.id) { - let valid = tenant_shard.generation == Some(Generation::new(req_tenant.gen)); - tracing::info!( - "handle_validate: {}(gen {}): valid={valid} (latest {:?})", - req_tenant.id, - req_tenant.gen, - tenant_shard.generation - ); - response.tenants.push(ValidateResponseTenant { - id: req_tenant.id, - valid, - }); + for (tenant_shard_id, validate_generation, valid) in in_memory_result.into_iter() { + let valid = if valid { + let db_generation = db_generations.get(&tenant_shard_id); + db_generation == Some(&Some(validate_generation)) } else { - // After tenant deletion, we may approve any validation. This avoids - // spurious warnings on the pageserver if it has pending LSN updates - // at the point a deletion happens. - response.tenants.push(ValidateResponseTenant { - id: req_tenant.id, - valid: true, - }); - } + // If in-memory state says it's invalid, trust that. It's always safe to fail a validation, at worst + // this prevents a pageserver from cleaning up an object in S3. + false + }; + + response.tenants.push(ValidateResponseTenant { + id: tenant_shard_id, + valid, + }) } - response + + Ok(response) } pub(crate) async fn tenant_create( @@ -2917,6 +2962,73 @@ impl Service { .await? } + pub(crate) async fn tenant_timeline_archival_config( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: TimelineArchivalConfigRequest, + ) -> Result<(), ApiError> { + tracing::info!( + "Setting archival config of timeline {tenant_id}/{timeline_id} to '{:?}'", + req.state + ); + + let _tenant_lock = trace_shared_lock( + &self.tenant_op_locks, + tenant_id, + TenantOperations::TimelineArchivalConfig, + ) + .await; + + self.tenant_remote_mutation(tenant_id, move |targets| async move { + if targets.is_empty() { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant not found").into(), + )); + } + async fn config_one( + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + node: Node, + jwt: Option, + req: TimelineArchivalConfigRequest, + ) -> Result<(), ApiError> { + tracing::info!( + "Setting archival config of timeline on shard {tenant_shard_id}/{timeline_id}, attached to node {node}", + ); + + let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + + client + .timeline_archival_config(tenant_shard_id, timeline_id, &req) + .await + .map_err(|e| match e { + mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg) => { + ApiError::PreconditionFailed(msg.into_boxed_str()) + } + _ => passthrough_api_error(&node, e), + }) + } + + // no shard needs to go first/last; the operation should be idempotent + // TODO: it would be great to ensure that all shards return the same error + let results = self + .tenant_for_shards(targets, |tenant_shard_id, node| { + futures::FutureExt::boxed(config_one( + tenant_shard_id, + timeline_id, + node, + self.config.jwt_token.clone(), + req.clone(), + )) + }) + .await?; + assert!(!results.is_empty(), "must have at least one result"); + + Ok(()) + }).await? + } + pub(crate) async fn tenant_timeline_detach_ancestor( &self, tenant_id: TenantId, @@ -3104,21 +3216,45 @@ impl Service { // run concurrently with reconciliations, and it is not guaranteed that the node we find here // will still be the latest when we're done: we will check generations again at the end of // this function to handle that. - let generations = self.persistence.peek_generations(tenant_id).await?; - let generations = if generations.iter().any(|i| i.1.is_none()) { - // One or more shards is not attached to anything: maybe this is a new tenant? Wait for - // it to reconcile. - self.ensure_attached_wait(tenant_id).await?; - self.persistence.peek_generations(tenant_id).await? - } else { - generations - }; + let generations = self.persistence.tenant_generations(tenant_id).await?; + + if generations + .iter() + .any(|i| i.generation.is_none() || i.generation_pageserver.is_none()) + { + // One or more shards has not been attached to a pageserver. Check if this is because it's configured + // to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry) + let locked = self.inner.read().unwrap(); + for (shard_id, shard) in + locked.tenants.range(TenantShardId::tenant_range(tenant_id)) + { + match shard.policy { + PlacementPolicy::Attached(_) => { + // This shard is meant to be attached: the caller is not wrong to try and + // use this function, but we can't service the request right now. + } + PlacementPolicy::Secondary | PlacementPolicy::Detached => { + return Err(ApiError::Conflict(format!( + "Shard {shard_id} tenant has policy {:?}", + shard.policy + ))); + } + } + } + + return Err(ApiError::ResourceUnavailable( + "One or more shards in tenant is not yet attached".into(), + )); + } let locked = self.inner.read().unwrap(); - for (tenant_shard_id, generation, generation_pageserver) in generations { - let node_id = generation_pageserver.ok_or(ApiError::Conflict( - "Tenant not currently attached".to_string(), - ))?; + for ShardGenerationState { + tenant_shard_id, + generation, + generation_pageserver, + } in generations + { + let node_id = generation_pageserver.expect("We checked for None above"); let node = locked .nodes .get(&node_id) @@ -3137,10 +3273,16 @@ impl Service { // Post-check: are all the generations of all the shards the same as they were initially? This proves that // our remote operation executed on the latest generation and is therefore persistent. { - let latest_generations = self.persistence.peek_generations(tenant_id).await?; + let latest_generations = self.persistence.tenant_generations(tenant_id).await?; if latest_generations .into_iter() - .map(|g| (g.0, g.1)) + .map( + |ShardGenerationState { + tenant_shard_id, + generation, + generation_pageserver: _, + }| (tenant_shard_id, generation), + ) .collect::>() != target_gens .into_iter() @@ -4683,29 +4825,84 @@ impl Service { ) .await; - { + if register_req.availability_zone_id.is_none() { + tracing::warn!( + "Node {} registering without specific availability zone id", + register_req.node_id + ); + } + + enum RegistrationStatus { + Matched(Node), + Mismatched, + New, + } + + let registration_status = { let locked = self.inner.read().unwrap(); if let Some(node) = locked.nodes.get(®ister_req.node_id) { - // Note that we do not do a total equality of the struct, because we don't require - // the availability/scheduling states to agree for a POST to be idempotent. if node.registration_match(®ister_req) { - tracing::info!( - "Node {} re-registered with matching address", - register_req.node_id - ); - return Ok(()); + RegistrationStatus::Matched(node.clone()) } else { - // TODO: decide if we want to allow modifying node addresses without removing and re-adding - // the node. Safest/simplest thing is to refuse it, and usually we deploy with - // a fixed address through the lifetime of a node. - tracing::warn!( - "Node {} tried to register with different address", - register_req.node_id - ); - return Err(ApiError::Conflict( - "Node is already registered with different address".to_string(), - )); + RegistrationStatus::Mismatched } + } else { + RegistrationStatus::New + } + }; + + match registration_status { + RegistrationStatus::Matched(node) => { + tracing::info!( + "Node {} re-registered with matching address", + register_req.node_id + ); + + if node.get_availability_zone_id().is_none() { + if let Some(az_id) = register_req.availability_zone_id.clone() { + tracing::info!("Extracting availability zone id from registration request for node {}: {}", + register_req.node_id, az_id); + + // Persist to the database and update in memory state. See comment below + // on ordering. + self.persistence + .set_node_availability_zone_id(register_req.node_id, az_id) + .await?; + let node_with_az = Node::new( + register_req.node_id, + register_req.listen_http_addr, + register_req.listen_http_port, + register_req.listen_pg_addr, + register_req.listen_pg_port, + register_req.availability_zone_id, + ); + + let mut locked = self.inner.write().unwrap(); + let mut new_nodes = (*locked.nodes).clone(); + + locked.scheduler.node_upsert(&node_with_az); + new_nodes.insert(register_req.node_id, node_with_az); + + locked.nodes = Arc::new(new_nodes); + } + } + + return Ok(()); + } + RegistrationStatus::Mismatched => { + // TODO: decide if we want to allow modifying node addresses without removing and re-adding + // the node. Safest/simplest thing is to refuse it, and usually we deploy with + // a fixed address through the lifetime of a node. + tracing::warn!( + "Node {} tried to register with different address", + register_req.node_id + ); + return Err(ApiError::Conflict( + "Node is already registered with different address".to_string(), + )); + } + RegistrationStatus::New => { + // fallthrough } } @@ -4742,6 +4939,7 @@ impl Service { register_req.listen_http_port, register_req.listen_pg_addr, register_req.listen_pg_port, + register_req.availability_zone_id, ); // TODO: idempotency if the node already exists in the database @@ -5223,72 +5421,6 @@ impl Service { )) } - /// Helper for methods that will try and call pageserver APIs for - /// a tenant, such as timeline CRUD: they cannot proceed unless the tenant - /// is attached somewhere. - fn ensure_attached_schedule( - &self, - mut locked: std::sync::RwLockWriteGuard<'_, ServiceState>, - tenant_id: TenantId, - ) -> Result, anyhow::Error> { - let mut waiters = Vec::new(); - let (nodes, tenants, scheduler) = locked.parts_mut(); - - let mut schedule_context = ScheduleContext::default(); - for (tenant_shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { - shard.schedule(scheduler, &mut schedule_context)?; - - // The shard's policies may not result in an attached location being scheduled: this - // is an error because our caller needs it attached somewhere. - if shard.intent.get_attached().is_none() { - return Err(anyhow::anyhow!( - "Tenant {tenant_id} not scheduled to be attached" - )); - }; - - if shard.stably_attached().is_some() { - // We do not require the shard to be totally up to date on reconciliation: we just require - // that it has been attached on the intended node. Other dirty state such as unattached secondary - // locations, or compute hook notifications can be ignored. - continue; - } - - if let Some(waiter) = self.maybe_reconcile_shard(shard, nodes) { - tracing::info!("Waiting for shard {tenant_shard_id} to reconcile, in order to ensure it is attached"); - waiters.push(waiter); - } - } - Ok(waiters) - } - - async fn ensure_attached_wait(&self, tenant_id: TenantId) -> Result<(), ApiError> { - let ensure_waiters = { - let locked = self.inner.write().unwrap(); - - // Check if the tenant is splitting: in this case, even if it is attached, - // we must act as if it is not: this blocks e.g. timeline creation/deletion - // operations during the split. - for (_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) { - if !matches!(shard.splitting, SplitState::Idle) { - return Err(ApiError::ResourceUnavailable( - "Tenant shards are currently splitting".into(), - )); - } - } - - self.ensure_attached_schedule(locked, tenant_id) - .map_err(ApiError::InternalServerError)? - }; - - let deadline = Instant::now().checked_add(Duration::from_secs(5)).unwrap(); - for waiter in ensure_waiters { - let timeout = deadline.duration_since(Instant::now()); - waiter.wait_timeout(timeout).await?; - } - - Ok(()) - } - /// Like [`Self::maybe_configured_reconcile_shard`], but uses the default reconciler /// configuration fn maybe_reconcile_shard( @@ -6165,9 +6297,13 @@ impl Service { node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - // TODO(vlad): Currently this operates on the assumption that all - // secondaries are warm. This is not always true (e.g. we just migrated the - // tenant). Take that into consideration by checking the secondary status. + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); + let reconciler_config = ReconcilerConfigBuilder::new() + .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) + .secondary_download_request_timeout(SECONDARY_DOWNLOAD_REQUEST_TIMEOUT) + .build(); + let mut tids_to_promote = self.fill_node_plan(node_id); let mut waiters = Vec::new(); @@ -6235,9 +6371,11 @@ impl Service { node_id ); - if let Some(waiter) = - self.maybe_reconcile_shard(tenant_shard, nodes) - { + if let Some(waiter) = self.maybe_configured_reconcile_shard( + tenant_shard, + nodes, + reconciler_config, + ) { waiters.push(waiter); } } @@ -6381,4 +6519,18 @@ impl Service { global_observed } + + pub(crate) async fn get_safekeeper( + &self, + id: i64, + ) -> Result { + self.persistence.safekeeper_get(id).await + } + + pub(crate) async fn upsert_safekeeper( + &self, + record: crate::persistence::SafekeeperPersistence, + ) -> Result<(), DatabaseError> { + self.persistence.safekeeper_upsert(record).await + } } diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 08b0f06ebf..15dfb101b5 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -1,6 +1,7 @@ -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap, HashSet}; use anyhow::Context; +use itertools::Itertools; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; use pageserver_api::shard::ShardIndex; @@ -47,6 +48,56 @@ impl TimelineAnalysis { } } +/// Checks whether a layer map is valid (i.e., is a valid result of the current compaction algorithm if nothing goes wrong). +/// The function checks if we can split the LSN range of a delta layer only at the LSNs of the delta layers. For example, +/// +/// ```plain +/// | | | | +/// | 1 | | 2 | | 3 | +/// | | | | | | +/// ``` +/// +/// This is not a valid layer map because the LSN range of layer 1 intersects with the LSN range of layer 2. 1 and 2 should have +/// the same LSN range. +/// +/// The exception is that when layer 2 only contains a single key, it could be split over the LSN range. For example, +/// +/// ```plain +/// | | | 2 | | | +/// | 1 | |-------| | 3 | +/// | | | 4 | | | +/// +/// If layer 2 and 4 contain the same single key, this is also a valid layer map. +fn check_valid_layermap(metadata: &HashMap) -> Option { + let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) + let mut all_delta_layers = Vec::new(); + for (name, _) in metadata.iter() { + if let LayerName::Delta(layer) = name { + if layer.key_range.start.next() != layer.key_range.end { + all_delta_layers.push(layer.clone()); + } + } + } + for layer in &all_delta_layers { + let lsn_range = &layer.lsn_range; + lsn_split_point.insert(lsn_range.start); + lsn_split_point.insert(lsn_range.end); + } + for layer in &all_delta_layers { + let lsn_range = layer.lsn_range.clone(); + let intersects = lsn_split_point.range(lsn_range).collect_vec(); + if intersects.len() > 1 { + let err = format!( + "layer violates the layer map LSN split assumption: layer {} intersects with LSN [{}]", + layer, + intersects.into_iter().map(|lsn| lsn.to_string()).join(", ") + ); + return Some(err); + } + } + None +} + pub(crate) async fn branch_cleanup_and_check_errors( remote_client: &GenericRemoteStorage, id: &TenantShardTimelineId, @@ -126,6 +177,12 @@ pub(crate) async fn branch_cleanup_and_check_errors( } } + if let Some(err) = check_valid_layermap(&index_part.layer_metadata) { + result.errors.push(format!( + "index_part.json contains invalid layer map structure: {err}" + )); + } + for (layer, metadata) in index_part.layer_metadata { if metadata.file_size == 0 { result.errors.push(format!( diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 112f052e07..3f08cddf50 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -36,7 +36,7 @@ use serde::{Deserialize, Serialize}; use storage_controller_client::control_api; use tokio::io::AsyncReadExt; use tokio_util::sync::CancellationToken; -use tracing::error; +use tracing::{error, warn}; use tracing_appender::non_blocking::WorkerGuard; use tracing_subscriber::{fmt, prelude::*, EnvFilter}; use utils::fs_ext; @@ -422,7 +422,7 @@ fn stream_objects_with_retries<'a>( let yield_err = if err.is_permanent() { true } else { - let backoff_time = 1 << trial.max(5); + let backoff_time = 1 << trial.min(5); tokio::time::sleep(Duration::from_secs(backoff_time)).await; trial += 1; trial == MAX_RETRIES - 1 @@ -466,14 +466,14 @@ async fn list_objects_with_retries( return Err(e) .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); } - error!( + warn!( "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter, DisplayErrorContext(e), ); - let backoff_time = 1 << trial.max(5); + let backoff_time = 1 << trial.min(5); tokio::time::sleep(Duration::from_secs(backoff_time)).await; } } @@ -492,7 +492,7 @@ async fn download_object_with_retries( Ok(response) => response, Err(e) => { error!("Failed to download object for key {key}: {e}"); - let backoff_time = 1 << trial.max(5); + let backoff_time = 1 << trial.min(5); tokio::time::sleep(Duration::from_secs(backoff_time)).await; continue; } @@ -508,7 +508,7 @@ async fn download_object_with_retries( } Err(e) => { error!("Failed to stream object body for key {key}: {e}"); - let backoff_time = 1 << trial.max(5); + let backoff_time = 1 << trial.min(5); tokio::time::sleep(Duration::from_secs(backoff_time)).await; } } diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 3935e513e3..c5961753c5 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,4 +1,4 @@ -use anyhow::{anyhow, bail}; +use anyhow::{anyhow, bail, Context}; use camino::Utf8PathBuf; use pageserver_api::controller_api::{MetadataHealthUpdateRequest, MetadataHealthUpdateResponse}; use pageserver_api::shard::TenantShardId; @@ -7,6 +7,7 @@ use storage_controller_client::control_api; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_pageserver_metadata; +use storage_scrubber::scan_safekeeper_metadata::DatabaseOrList; use storage_scrubber::tenant_snapshot::SnapshotDownloader; use storage_scrubber::{find_large_objects, ControllerClientConfig}; use storage_scrubber::{ @@ -76,6 +77,9 @@ enum Command { /// For safekeeper node_kind only, table in the db with debug dump #[arg(long, default_value = None)] dump_db_table: Option, + /// For safekeeper node_kind only, json list of timelines and their lsn info + #[arg(long, default_value = None)] + timeline_lsns: Option, }, TenantSnapshot { #[arg(long = "tenant-id")] @@ -155,20 +159,22 @@ async fn main() -> anyhow::Result<()> { post_to_storcon, dump_db_connstr, dump_db_table, + timeline_lsns, } => { if let NodeKind::Safekeeper = node_kind { - let dump_db_connstr = - dump_db_connstr.ok_or(anyhow::anyhow!("dump_db_connstr not specified"))?; - let dump_db_table = - dump_db_table.ok_or(anyhow::anyhow!("dump_db_table not specified"))?; - - let summary = scan_safekeeper_metadata( - bucket_config.clone(), - tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(), - dump_db_connstr, - dump_db_table, - ) - .await?; + let db_or_list = match (timeline_lsns, dump_db_connstr) { + (Some(timeline_lsns), _) => { + let timeline_lsns = serde_json::from_str(&timeline_lsns).context("parsing timeline_lsns")?; + DatabaseOrList::List(timeline_lsns) + } + (None, Some(dump_db_connstr)) => { + let dump_db_table = dump_db_table.ok_or_else(|| anyhow::anyhow!("dump_db_table not specified"))?; + let tenant_ids = tenant_ids.iter().map(|tshid| tshid.tenant_id).collect(); + DatabaseOrList::Database { tenant_ids, connstr: dump_db_connstr, table: dump_db_table } + } + (None, None) => anyhow::bail!("neither `timeline_lsns` specified, nor `dump_db_connstr` and `dump_db_table`"), + }; + let summary = scan_safekeeper_metadata(bucket_config.clone(), db_or_list).await?; if json { println!("{}", serde_json::to_string(&summary).unwrap()) } else { diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 1a9f3d0ef5..15f3665fac 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -7,7 +7,7 @@ use postgres_ffi::{XLogFileName, PG_TLI}; use remote_storage::GenericRemoteStorage; use serde::Serialize; use tokio_postgres::types::PgLsn; -use tracing::{error, info, trace}; +use tracing::{debug, error, info}; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, @@ -54,6 +54,23 @@ impl MetadataSummary { } } +#[derive(serde::Deserialize)] +pub struct TimelineLsnData { + tenant_id: String, + timeline_id: String, + timeline_start_lsn: Lsn, + backup_lsn: Lsn, +} + +pub enum DatabaseOrList { + Database { + tenant_ids: Vec, + connstr: String, + table: String, + }, + List(Vec), +} + /// Scan the safekeeper metadata in an S3 bucket, reporting errors and /// statistics. /// @@ -63,68 +80,39 @@ impl MetadataSummary { /// the project wasn't deleted in the meanwhile. pub async fn scan_safekeeper_metadata( bucket_config: BucketConfig, - tenant_ids: Vec, - dump_db_connstr: String, - dump_db_table: String, + db_or_list: DatabaseOrList, ) -> anyhow::Result { info!( - "checking bucket {}, region {}, dump_db_table {}", - bucket_config.bucket, bucket_config.region, dump_db_table + "checking bucket {}, region {}", + bucket_config.bucket, bucket_config.region ); - // Use rustls (Neon requires TLS) - let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); - let client_config = rustls::ClientConfig::builder() - .with_root_certificates(root_store) - .with_no_client_auth(); - let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); - let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; - // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. - tokio::spawn(async move { - if let Err(e) = connection.await { - eprintln!("connection error: {}", e); - } - }); - - let tenant_filter_clause = if !tenant_ids.is_empty() { - format!( - "and tenant_id in ({})", - tenant_ids - .iter() - .map(|t| format!("'{}'", t)) - .collect::>() - .join(", ") - ) - } else { - "".to_owned() - }; - let query = format!( - "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) from \"{}\" where not is_cancelled {} group by tenant_id, timeline_id;", - dump_db_table, tenant_filter_clause, - ); - info!("query is {}", query); - let timelines = client.query(&query, &[]).await?; - info!("loaded {} timelines", timelines.len()); let (remote_client, target) = init_remote(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); - let checks = futures::stream::iter(timelines.iter().map(Ok)).map_ok(|row| { - let tenant_id = TenantId::from_str(row.get(0)).expect("failed to parse tenant_id"); - let timeline_id = TimelineId::from_str(row.get(1)).expect("failed to parse tenant_id"); - let timeline_start_lsn_pg: PgLsn = row.get(2); - let timeline_start_lsn: Lsn = Lsn(u64::from(timeline_start_lsn_pg)); - let backup_lsn_pg: PgLsn = row.get(3); - let backup_lsn: Lsn = Lsn(u64::from(backup_lsn_pg)); + let timelines = match db_or_list { + DatabaseOrList::Database { + tenant_ids, + connstr, + table, + } => load_timelines_from_db(tenant_ids, connstr, table).await?, + DatabaseOrList::List(list) => list, + }; + info!("loaded {} timelines", timelines.len()); + + let checks = futures::stream::iter(timelines.into_iter().map(Ok)).map_ok(|timeline| { + let tenant_id = TenantId::from_str(&timeline.tenant_id).expect("failed to parse tenant_id"); + let timeline_id = + TimelineId::from_str(&timeline.timeline_id).expect("failed to parse tenant_id"); let ttid = TenantTimelineId::new(tenant_id, timeline_id); check_timeline( &remote_client, &target, &cloud_admin_api_client, ttid, - timeline_start_lsn, - backup_lsn, + timeline.timeline_start_lsn, + timeline.backup_lsn, ) }); // Run multiple check_timeline's concurrently. @@ -163,11 +151,9 @@ async fn check_timeline( timeline_start_lsn: Lsn, backup_lsn: Lsn, ) -> anyhow::Result { - trace!( + debug!( "checking ttid {}, should contain WAL [{}-{}]", - ttid, - timeline_start_lsn, - backup_lsn + ttid, timeline_start_lsn, backup_lsn ); // calculate expected segfiles let expected_first_segno = timeline_start_lsn.segment_number(WAL_SEGSIZE); @@ -177,7 +163,7 @@ async fn check_timeline( .map(|segno| XLogFileName(PG_TLI, segno, WAL_SEGSIZE)), ); let expected_files_num = expected_segfiles.len(); - trace!("expecting {} files", expected_segfiles.len(),); + debug!("expecting {} files", expected_segfiles.len(),); // now list s3 and check if it misses something let ttshid = @@ -252,3 +238,65 @@ fn load_certs() -> Result, std::io::Error> { Ok(Arc::new(store)) } static TLS_ROOTS: OnceCell> = OnceCell::new(); + +async fn load_timelines_from_db( + tenant_ids: Vec, + dump_db_connstr: String, + dump_db_table: String, +) -> anyhow::Result> { + info!("loading from table {dump_db_table}"); + + // Use rustls (Neon requires TLS) + let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); + let client_config = rustls::ClientConfig::builder() + .with_root_certificates(root_store) + .with_no_client_auth(); + let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); + let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; + // The connection object performs the actual communication with the database, + // so spawn it off to run on its own. + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + let tenant_filter_clause = if !tenant_ids.is_empty() { + format!( + "and tenant_id in ({})", + tenant_ids + .iter() + .map(|t| format!("'{}'", t)) + .collect::>() + .join(", ") + ) + } else { + "".to_owned() + }; + let query = format!( + "select tenant_id, timeline_id, min(timeline_start_lsn), max(backup_lsn) \ + from \"{dump_db_table}\" \ + where not is_cancelled {tenant_filter_clause} \ + group by tenant_id, timeline_id;" + ); + info!("query is {}", query); + let timelines = client.query(&query, &[]).await?; + + let timelines = timelines + .into_iter() + .map(|row| { + let tenant_id = row.get(0); + let timeline_id = row.get(1); + let timeline_start_lsn_pg: PgLsn = row.get(2); + let backup_lsn_pg: PgLsn = row.get(3); + + TimelineLsnData { + tenant_id, + timeline_id, + timeline_start_lsn: Lsn(u64::from(timeline_start_lsn_pg)), + backup_lsn: Lsn(u64::from(backup_lsn_pg)), + } + }) + .collect::>(); + Ok(timelines) +} diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 5fe544b3bd..7c4a8db36f 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -1,3 +1,5 @@ +import os +import time from abc import ABC, abstractmethod from contextlib import _GeneratorContextManager, contextmanager @@ -8,6 +10,7 @@ import pytest from _pytest.fixtures import FixtureRequest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, PgBin, @@ -102,7 +105,6 @@ class NeonCompare(PgCompare): zenbenchmark: NeonBenchmarker, neon_simple_env: NeonEnv, pg_bin: PgBin, - branch_name: str, ): self.env = neon_simple_env self._zenbenchmark = zenbenchmark @@ -110,16 +112,11 @@ class NeonCompare(PgCompare): self.pageserver_http_client = self.env.pageserver.http_client() # note that neon_simple_env now uses LOCAL_FS remote storage - - # Create tenant - tenant_conf: Dict[str, str] = {} - self.tenant, _ = self.env.neon_cli.create_tenant(conf=tenant_conf) - - # Create timeline - self.timeline = self.env.neon_cli.create_timeline(branch_name, tenant_id=self.tenant) + self.tenant = self.env.initial_tenant + self.timeline = self.env.initial_timeline # Start pg - self._pg = self.env.endpoints.create_start(branch_name, "main", self.tenant) + self._pg = self.env.endpoints.create_start("main", "main", self.tenant) @property def pg(self) -> PgProtocol: @@ -297,13 +294,11 @@ class RemoteCompare(PgCompare): @pytest.fixture(scope="function") def neon_compare( - request: FixtureRequest, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, neon_simple_env: NeonEnv, ) -> NeonCompare: - branch_name = request.node.name - return NeonCompare(zenbenchmark, neon_simple_env, pg_bin, branch_name) + return NeonCompare(zenbenchmark, neon_simple_env, pg_bin) @pytest.fixture(scope="function") @@ -341,3 +336,26 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare: fixture = request.getfixturevalue(request.param) assert isinstance(fixture, PgCompare), f"test error: fixture {fixture} is not PgCompare" return fixture + + +@pytest.fixture(scope="function", autouse=True) +def sync_after_each_test(): + # The fixture calls `sync(2)` after each test if `SYNC_AFTER_EACH_TEST` env var is `true` + # + # In CI, `SYNC_AFTER_EACH_TEST` is set to `true` only for benchmarks (`test_runner/performance`) + # that are run on self-hosted runners because some of these tests are pretty write-heavy + # and create issues to start the processes within 10s + key = "SYNC_AFTER_EACH_TEST" + enabled = os.environ.get(key) == "true" + + yield + + if not enabled: + # regress test, or running locally + return + + start = time.time() + # we only run benches on unices, the method might not exist on windows + os.sync() + elapsed = time.time() - start + log.info(f"called sync after test {elapsed=}") diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 92febfec9b..2df45a7e0e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -24,7 +24,20 @@ from functools import cached_property, partial from itertools import chain, product from pathlib import Path from types import TracebackType -from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union, cast +from typing import ( + Any, + Callable, + Dict, + Iterable, + Iterator, + List, + Optional, + Tuple, + Type, + TypeVar, + Union, + cast, +) from urllib.parse import quote, urlparse import asyncpg @@ -90,6 +103,8 @@ from fixtures.utils import AuxFileStore as AuxFileStore # reexport from .neon_api import NeonAPI, NeonApiEndpoint +T = TypeVar("T") + """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. @@ -496,6 +511,7 @@ class NeonEnvBuilder: pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, safekeeper_extra_opts: Optional[list[str]] = None, storage_controller_port_override: Optional[int] = None, + pageserver_io_buffer_alignment: Optional[int] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -550,6 +566,8 @@ class NeonEnvBuilder: self.storage_controller_port_override = storage_controller_port_override + self.pageserver_io_buffer_alignment = pageserver_io_buffer_alignment + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -1123,6 +1141,7 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy + self.pageserver_io_buffer_alignment = config.pageserver_io_buffer_alignment # Create the neon_local's `NeonLocalInitConf` cfg: Dict[str, Any] = { @@ -1160,6 +1179,8 @@ class NeonEnv: "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, + # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` + "availability_zone": "us-east-2a", } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine @@ -1184,13 +1205,11 @@ class NeonEnv: for key, value in override.items(): ps_cfg[key] = value + ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment + # Create a corresponding NeonPageserver object self.pageservers.append( - NeonPageserver( - self, - ps_id, - port=pageserver_port, - ) + NeonPageserver(self, ps_id, port=pageserver_port, az_id=ps_cfg["availability_zone"]) ) cfg["pageservers"].append(ps_cfg) @@ -1425,6 +1444,7 @@ def _shared_simple_env( pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore], pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], + pageserver_io_buffer_alignment: Optional[int], ) -> Iterator[NeonEnv]: """ # Internal fixture backing the `neon_simple_env` fixture. If TEST_SHARED_FIXTURES @@ -1457,6 +1477,7 @@ def _shared_simple_env( pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, + pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, ) as builder: env = builder.init_start() @@ -1499,6 +1520,7 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]], pageserver_aux_file_policy: Optional[AuxFileStore], record_property: Callable[[str, object], None], + pageserver_io_buffer_alignment: Optional[int], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1534,6 +1556,7 @@ def neon_env_builder( test_overlay_dir=test_overlay_dir, pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, + pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, ) as builder: yield builder # Propogate `preserve_database_files` to make it possible to use in other fixtures, @@ -2390,6 +2413,7 @@ class NeonStorageController(MetricsGetter, LogUtils): "listen_http_port": node.service_port.http, "listen_pg_addr": "localhost", "listen_pg_port": node.service_port.pg, + "availability_zone_id": node.az_id, } log.info(f"node_register({body})") self.request( @@ -2836,6 +2860,29 @@ class NeonStorageController(MetricsGetter, LogUtils): raise AssertionError("unreachable") + def on_safekeeper_deploy(self, id: int, body: dict[str, Any]): + self.request( + "POST", + f"{self.api}/control/v1/safekeeper/{id}", + headers=self.headers(TokenScope.ADMIN), + json=body, + ) + + def get_safekeeper(self, id: int) -> Optional[dict[str, Any]]: + try: + response = self.request( + "GET", + f"{self.api}/control/v1/safekeeper/{id}", + headers=self.headers(TokenScope.ADMIN), + ) + json = response.json() + assert isinstance(json, dict) + return json + except StorageControllerApiException as e: + if e.status_code == 404: + return None + raise e + def __enter__(self) -> "NeonStorageController": return self @@ -2913,10 +2960,11 @@ class NeonPageserver(PgProtocol, LogUtils): TEMP_FILE_SUFFIX = "___temp" - def __init__(self, env: NeonEnv, id: int, port: PageserverPort): + def __init__(self, env: NeonEnv, id: int, port: PageserverPort, az_id: str): super().__init__(host="localhost", port=port.pg, user="cloud_admin") self.env = env self.id = id + self.az_id = az_id self.running = False self.service_port = port self.version = env.get_binary_version("pageserver") @@ -2953,16 +3001,17 @@ class NeonPageserver(PgProtocol, LogUtils): def config_toml_path(self) -> Path: return self.workdir / "pageserver.toml" - def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], None]): + def edit_config_toml(self, edit_fn: Callable[[Dict[str, Any]], T]) -> T: """ Edit the pageserver's config toml file in place. """ path = self.config_toml_path with open(path, "r") as f: config = toml.load(f) - edit_fn(config) + res = edit_fn(config) with open(path, "w") as f: toml.dump(config, f) + return res def patch_config_toml_nonrecursive(self, patch: Dict[str, Any]) -> Dict[str, Any]: """ @@ -4543,6 +4592,8 @@ class Safekeeper(LogUtils): def timeline_dir(self, tenant_id, timeline_id) -> Path: return self.data_dir / str(tenant_id) / str(timeline_id) + # List partial uploaded segments of this safekeeper. Works only for + # RemoteStorageKind.LOCAL_FS. def list_uploaded_segments(self, tenant_id: TenantId, timeline_id: TimelineId): tline_path = ( self.env.repo_dir @@ -4552,9 +4603,11 @@ class Safekeeper(LogUtils): / str(timeline_id) ) assert isinstance(self.env.safekeepers_remote_storage, LocalFsStorage) - return self._list_segments_in_dir( + segs = self._list_segments_in_dir( tline_path, lambda name: ".metadata" not in name and ".___temp" not in name ) + mysegs = [s for s in segs if f"sk{self.id}" in s] + return mysegs def list_segments(self, tenant_id, timeline_id) -> List[str]: """ @@ -4615,12 +4668,20 @@ class Safekeeper(LogUtils): wait_until(20, 0.5, paused) +# TODO: Replace with `StrEnum` when we upgrade to python 3.11 +class NodeKind(str, Enum): + PAGESERVER = "pageserver" + SAFEKEEPER = "safekeeper" + + class StorageScrubber: def __init__(self, env: NeonEnv, log_dir: Path): self.env = env self.log_dir = log_dir - def scrubber_cli(self, args: list[str], timeout) -> str: + def scrubber_cli( + self, args: list[str], timeout, extra_env: Optional[Dict[str, str]] = None + ) -> str: assert isinstance(self.env.pageserver_remote_storage, S3Storage) s3_storage = self.env.pageserver_remote_storage @@ -4635,6 +4696,9 @@ class StorageScrubber: if s3_storage.endpoint is not None: env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) + if extra_env is not None: + env.update(extra_env) + base_args = [ str(self.env.neon_binpath / "storage_scrubber"), f"--controller-api={self.env.storage_controller.api_root()}", @@ -4662,18 +4726,43 @@ class StorageScrubber: assert stdout is not None return stdout - def scan_metadata(self, post_to_storage_controller: bool = False) -> Tuple[bool, Any]: + def scan_metadata_safekeeper( + self, + timeline_lsns: List[Dict[str, Any]], + cloud_admin_api_url: str, + cloud_admin_api_token: str, + ) -> Tuple[bool, Any]: + extra_env = { + "CLOUD_ADMIN_API_URL": cloud_admin_api_url, + "CLOUD_ADMIN_API_TOKEN": cloud_admin_api_token, + } + return self.scan_metadata( + node_kind=NodeKind.SAFEKEEPER, timeline_lsns=timeline_lsns, extra_env=extra_env + ) + + def scan_metadata( + self, + post_to_storage_controller: bool = False, + node_kind: NodeKind = NodeKind.PAGESERVER, + timeline_lsns: Optional[List[Dict[str, Any]]] = None, + extra_env: Optional[Dict[str, str]] = None, + ) -> Tuple[bool, Any]: """ Returns the health status and the metadata summary. """ - args = ["scan-metadata", "--node-kind", "pageserver", "--json"] + args = ["scan-metadata", "--node-kind", node_kind.value, "--json"] if post_to_storage_controller: args.append("--post") - stdout = self.scrubber_cli(args, timeout=30) + if timeline_lsns is not None: + args.append("--timeline-lsns") + args.append(json.dumps(timeline_lsns)) + stdout = self.scrubber_cli(args, timeout=30, extra_env=extra_env) try: summary = json.loads(stdout) - healthy = not summary["with_errors"] and not summary["with_warnings"] + # summary does not contain "with_warnings" if node_kind is the safekeeper + no_warnings = "with_warnings" not in summary or not summary["with_warnings"] + healthy = not summary["with_errors"] and no_warnings return healthy, summary except: log.error("Failed to decode JSON output from `scan-metadata`. Dumping stdout:") diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 92c98763e3..e2dd51802c 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -34,6 +34,11 @@ def pageserver_virtual_file_io_engine() -> Optional[str]: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE") +@pytest.fixture(scope="function", autouse=True) +def pageserver_io_buffer_alignment() -> Optional[int]: + return None + + @pytest.fixture(scope="function", autouse=True) def pageserver_aux_file_policy() -> Optional[AuxFileStore]: return None diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 05b43cfb72..9bf03554e7 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -174,6 +174,22 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def debug_dump_timeline( + self, timeline_id: TimelineId, params: Optional[Dict[str, str]] = None + ) -> Any: + params = params or {} + params["timeline_id"] = str(timeline_id) + dump = self.debug_dump(params) + return dump["timelines"][0] + + def get_partial_backup(self, timeline_id: TimelineId) -> Any: + dump = self.debug_dump_timeline(timeline_id, {"dump_control_file": "true"}) + return dump["control_file"]["partial_backup"] + + def get_eviction_state(self, timeline_id: TimelineId) -> Any: + dump = self.debug_dump_timeline(timeline_id, {"dump_control_file": "true"}) + return dump["control_file"]["eviction_state"] + def pull_timeline(self, body: Dict[str, Any]) -> Dict[str, Any]: res = self.post(f"http://localhost:{self.port}/v1/pull_timeline", json=body) res.raise_for_status() @@ -228,6 +244,14 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def backup_partial_reset(self, tenant_id: TenantId, timeline_id: TimelineId): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/backup_partial_reset", + json={}, + ) + res.raise_for_status() + return res.json() + def record_safekeeper_info(self, tenant_id: TenantId, timeline_id: TimelineId, body): res = self.post( f"http://localhost:{self.port}/v1/record_safekeeper_info/{tenant_id}/{timeline_id}", diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 513ebc74c3..c824e60c29 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -2,14 +2,14 @@ import statistics import threading import time import timeit -from typing import Any, Callable, List +from typing import Any, Callable, Generator, List import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.common_types import Lsn from fixtures.compare_fixtures import NeonCompare, PgCompare, VanillaCompare from fixtures.log_helper import log -from fixtures.neon_fixtures import DEFAULT_BRANCH_NAME, NeonEnvBuilder, PgBin +from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, flush_ep_to_pageserver from performance.test_perf_pgbench import get_durations_matrix, get_scales_matrix @@ -20,7 +20,7 @@ from performance.test_perf_pgbench import get_durations_matrix, get_scales_matri # For example, to build a `NeonCompare` interface, the corresponding fixture's param should have # a format of `neon_{safekeepers_enable_fsync}`. # Note that, here "_" is used to separate builder parameters. -def pg_compare(request) -> PgCompare: +def pg_compare(request) -> Generator[PgCompare, None, None]: x = request.param.split("_") if x[0] == "vanilla": @@ -28,7 +28,7 @@ def pg_compare(request) -> PgCompare: fixture = request.getfixturevalue("vanilla_compare") assert isinstance(fixture, VanillaCompare) - return fixture + yield fixture else: assert ( len(x) == 2 @@ -47,10 +47,15 @@ def pg_compare(request) -> PgCompare: neon_env_builder.safekeepers_enable_fsync = x[1] == "on" env = neon_env_builder.init_start() - env.neon_cli.create_branch("empty", ancestor_branch_name=DEFAULT_BRANCH_NAME) - branch_name = request.node.name - return NeonCompare(zenbenchmark, env, pg_bin, branch_name) + cmp = NeonCompare(zenbenchmark, env, pg_bin) + + yield cmp + + flush_ep_to_pageserver(env, cmp._pg, cmp.tenant, cmp.timeline) + env.pageserver.http_client().timeline_checkpoint( + cmp.tenant, cmp.timeline, compact=False, wait_until_uploaded=True + ) def start_heavy_write_workload(env: PgCompare, n_tables: int, scale: int, num_iters: int): diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index a7eda73d4c..bb337d9cc1 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -162,7 +162,6 @@ def test_fully_custom_config(positive_env: NeonEnv): "min_resident_size_override": 23, "timeline_get_throttle": { "task_kinds": ["PageRequestHandler"], - "fair": True, "initial": 0, "refill_interval": "1s", "refill_amount": 1000, diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index c361efe90a..467e5b1734 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -149,6 +149,10 @@ def test_create_snapshot( ) +# check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning +ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" + + @check_ondisk_data_compatibility_if_enabled @pytest.mark.xdist_group("compatibility") @pytest.mark.order(after="test_create_snapshot") @@ -173,6 +177,7 @@ def test_backward_compatibility( try: neon_env_builder.num_safekeepers = 3 env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo") + env.pageserver.allowed_errors.append(ingest_lag_log_line) neon_env_builder.start() check_neon_works( @@ -181,6 +186,9 @@ def test_backward_compatibility( sql_dump_path=compatibility_snapshot_dir / "dump.sql", repo_dir=env.repo_dir, ) + + env.pageserver.assert_log_contains(ingest_lag_log_line) + except Exception: if breaking_changes_allowed: pytest.xfail( @@ -238,6 +246,8 @@ def test_forward_compatibility( env = neon_env_builder.from_repo_dir( compatibility_snapshot_dir / "repo", ) + # there may be an arbitrary number of unrelated tests run between create_snapshot and here + env.pageserver.allowed_errors.append(ingest_lag_log_line) # not using env.pageserver.version because it was initialized before prev_pageserver_version_str = env.get_binary_version("pageserver") diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 73af7950f1..ebf58d2bd1 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -142,11 +142,10 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # We will start a pageserver with no control_plane_api set, so it won't be able to self-register env.storage_controller.node_register(env.pageserver) - replaced_config = env.pageserver.patch_config_toml_nonrecursive( - { - "control_plane_api": "", - } - ) + def remove_control_plane_api_field(config): + return config.pop("control_plane_api") + + control_plane_api = env.pageserver.edit_config_toml(remove_control_plane_api_field) env.pageserver.start() env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) @@ -179,7 +178,11 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): env.pageserver.stop() # Starting without the override that disabled control_plane_api - env.pageserver.patch_config_toml_nonrecursive(replaced_config) + env.pageserver.patch_config_toml_nonrecursive( + { + "control_plane_api": control_plane_api, + } + ) env.pageserver.start() generate_uploads_and_deletions(env, pageserver=env.pageserver, init=False) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 111285b40c..4c9eac5cd7 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -1,3 +1,4 @@ +import copy import json import uuid @@ -116,3 +117,58 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P assert ( duration_secs >= 10 * actual_smgr_query_seconds ), "smgr metrics should not include throttle wait time" + + +throttle_config_with_field_fair_set = { + "task_kinds": ["PageRequestHandler"], + "fair": True, + "initial": 27, + "refill_interval": "43s", + "refill_amount": 23, + "max": 42, +} + + +def assert_throttle_config_with_field_fair_set(conf): + """ + Field `fair` is ignored, so, responses don't contain it + """ + without_fair = copy.deepcopy(throttle_config_with_field_fair_set) + without_fair.pop("fair") + + assert conf == without_fair + + +def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_builder: NeonEnvBuilder): + """ + To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. + """ + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + # with_fair config should still be settable + ps_http.set_tenant_config( + env.initial_tenant, + {"timeline_get_throttle": throttle_config_with_field_fair_set}, + ) + conf = ps_http.tenant_config(env.initial_tenant) + assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) + assert_throttle_config_with_field_fair_set( + conf.tenant_specific_overrides["timeline_get_throttle"] + ) + + +def test_throttle_fair_config_is_settable_but_ignored_in_config_toml( + neon_env_builder: NeonEnvBuilder, +): + """ + To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. + """ + + def set_tenant_config(ps_cfg): + ps_cfg["tenant_config"] = {"timeline_get_throttle": throttle_config_with_field_fair_set} + + neon_env_builder.pageserver_config_override = set_tenant_config + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + conf = ps_http.tenant_config(env.initial_tenant) + assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index 66b6185aaa..f6404d68ac 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -247,9 +247,10 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): compaction_period_s = 10 + checkpoint_distance = 1024**3 tenant_conf = { # Large space + time thresholds: effectively disable these limits - "checkpoint_distance": f"{1024 ** 4}", + "checkpoint_distance": f"{checkpoint_distance}", "checkpoint_timeout": "3600s", "compaction_period": f"{compaction_period_s}s", } @@ -269,7 +270,11 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): for tenant, timeline, last_flush_lsn in last_flush_lsns: http_client = env.pageserver.http_client() initdb_lsn = Lsn(http_client.timeline_detail(tenant, timeline)["initdb_lsn"]) - total_bytes_ingested += last_flush_lsn - initdb_lsn + this_timeline_ingested = last_flush_lsn - initdb_lsn + assert ( + this_timeline_ingested < checkpoint_distance * 0.8 + ), "this test is supposed to fill InMemoryLayer" + total_bytes_ingested += this_timeline_ingested log.info(f"Ingested {total_bytes_ingested} bytes since initdb (vs max dirty {max_dirty_data})") assert total_bytes_ingested > max_dirty_data diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index ba8b91e84d..368f60127e 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,7 +1,15 @@ +import time + import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv +from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + NeonEnvBuilder, + last_flush_lsn_upload, + tenant_get_shards, +) from fixtures.pageserver.utils import wait_for_last_record_lsn from fixtures.utils import query_scalar @@ -17,7 +25,12 @@ def test_readonly_node(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_readonly_node", "empty") endpoint_main = env.endpoints.create_start("test_readonly_node") - env.pageserver.allowed_errors.append(".*basebackup .* failed: invalid basebackup lsn.*") + env.pageserver.allowed_errors.extend( + [ + ".*basebackup .* failed: invalid basebackup lsn.*", + ".*page_service.*handle_make_lsn_lease.*.*tried to request a page version that was garbage collected", + ] + ) main_pg_conn = endpoint_main.connect() main_cur = main_pg_conn.cursor() @@ -105,6 +118,103 @@ def test_readonly_node(neon_simple_env: NeonEnv): ) +def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): + """ + Test static endpoint is protected from GC by acquiring and renewing lsn leases. + """ + + neon_env_builder.num_pageservers = 2 + # GC is manual triggered. + env = neon_env_builder.init_start( + initial_tenant_conf={ + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": f"{128 * 1024}", + "compaction_threshold": "1", + "compaction_target_size": f"{128 * 1024}", + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "0s", + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # create image layers eagerly, so that GC can remove some layers + "image_creation_threshold": "1", + "image_layer_creation_check_threshold": "0", + # Short lease length to fit test. + "lsn_lease_length": "3s", + }, + initial_tenant_shard_count=2, + ) + + ROW_COUNT = 500 + + def generate_updates_on_main( + env: NeonEnv, + ep_main: Endpoint, + data: int, + start=1, + end=ROW_COUNT, + ) -> Lsn: + """ + Generates some load on main branch that results in some uploads. + """ + with ep_main.cursor() as cur: + cur.execute( + f"INSERT INTO t0 (v0, v1) SELECT g, '{data}' FROM generate_series({start}, {end}) g ON CONFLICT (v0) DO UPDATE SET v1 = EXCLUDED.v1" + ) + cur.execute("VACUUM t0") + last_flush_lsn = last_flush_lsn_upload( + env, ep_main, env.initial_tenant, env.initial_timeline + ) + return last_flush_lsn + + # Insert some records on main branch + with env.endpoints.create_start("main") as ep_main: + with ep_main.cursor() as cur: + cur.execute("CREATE TABLE t0(v0 int primary key, v1 text)") + lsn = None + for i in range(2): + lsn = generate_updates_on_main(env, ep_main, i) + + with env.endpoints.create_start( + branch_name="main", + endpoint_id="static", + lsn=lsn, + ) as ep_static: + with ep_static.cursor() as cur: + cur.execute("SELECT count(*) FROM t0") + assert cur.fetchone() == (ROW_COUNT,) + + time.sleep(3) + + generate_updates_on_main(env, ep_main, i, end=100) + + # Trigger GC + for shard, ps in tenant_get_shards(env, env.initial_tenant): + client = ps.http_client() + gc_result = client.timeline_gc(shard, env.initial_timeline, 0) + log.info(f"{gc_result=}") + + assert ( + gc_result["layers_removed"] == 0 + ), "No layers should be removed, old layers are guarded by leases." + + with ep_static.cursor() as cur: + cur.execute("SELECT count(*) FROM t0") + assert cur.fetchone() == (ROW_COUNT,) + + # Do some update so we can increment latest_gc_cutoff + generate_updates_on_main(env, ep_main, i, end=100) + + # Now trigger GC again, layers should be removed. + time.sleep(4) + for shard, ps in tenant_get_shards(env, env.initial_tenant): + client = ps.http_client() + gc_result = client.timeline_gc(shard, env.initial_timeline, 0) + log.info(f"{gc_result=}") + + assert gc_result["layers_removed"] > 0, "Old layers should be removed after leases expired." + + # Similar test, but with more data, and we force checkpoints def test_timetravel(neon_simple_env: NeonEnv): env = neon_simple_env diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 03eb7628be..8da42294b0 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -31,7 +31,7 @@ from fixtures.pageserver.utils import ( remote_storage_delete_key, timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion +from fixtures.pg_version import PgVersion, run_only_on_default_postgres from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.storage_controller_proxy import StorageControllerProxy @@ -2330,3 +2330,185 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder) connect=0, # Disable retries: we want to see the 503 ) ).timeline_create(PgVersion.NOT_SET, tenant_id, create_timeline_id) + + +def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvBuilder): + """ + A correctness edge case: while we are live migrating and a shard's generation is + visible to the Reconciler but not to the central Service, the generation validation + API should still prevent stale generations from doing deletions. + """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + env = neon_env_builder.init_configs() + env.start() + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + } + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + env.neon_cli.create_tenant(tenant_id, timeline_id) + env.storage_controller.pageserver_api().set_tenant_config(tenant_id, TENANT_CONF) + + # Write enough data that a compaction would do some work (deleting some L0s) + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(64) + for _i in range(0, 2): + workload.churn_rows(64, upload=False) + + # Upload but don't compact + origin_pageserver = env.get_tenant_pageserver(tenant_id) + dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] + origin_pageserver.http_client().timeline_checkpoint( + tenant_id, timeline_id, wait_until_uploaded=True, compact=False + ) + + # Start a compaction that will pause on a failpoint. + compaction_failpoint = "before-upload-index-pausable" + origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "pause")) + + # This failpoint can also cause migration code to time out trying to politely flush + # during migrations + origin_pageserver.allowed_errors.append(".*Timed out waiting for flush to remote storage.*") + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + compact_fut = executor.submit( + origin_pageserver.http_client().timeline_compact, + tenant_id, + timeline_id, + wait_until_uploaded=True, + ) + + # Let the compaction start and then get stuck uploading an index: when we live migrate, the new generation's + # index will be initialized from the pre-compaction index, referencing layers that the compaction will try to delete + def has_hit_compaction_failpoint(): + assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}") + + wait_until(10, 1, has_hit_compaction_failpoint) + + # While the compaction is running, start a live migration which will pause long enough for the compaction to sleep, + # after incrementing generation and attaching the new location + migration_failpoint = "reconciler-live-migrate-post-notify" + env.storage_controller.configure_failpoints((migration_failpoint, "pause")) + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, + TenantShardId(tenant_id, 0, 0), + dest_ps_id, + ) + + def has_hit_migration_failpoint(): + assert env.storage_controller.log_contains(f"at failpoint {migration_failpoint}") + + # Long wait because the migration will have to time out during transition to AttachedStale + # before it reaches this point. The timeout is because the AttachedStale transition includes + # a flush of remote storage, and if the compaction already enqueued an index upload this cannot + # make progress. + wait_until(60, 1, has_hit_migration_failpoint) + + # Origin pageserver has succeeded with compaction before the migration completed. It has done all the writes it wanted to do in its own (stale) generation + origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off")) + compact_fut.result() + origin_pageserver.http_client().deletion_queue_flush(execute=True) + + # Eventually migration completes + env.storage_controller.configure_failpoints((migration_failpoint, "off")) + migrate_fut.result() + except: + # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown + env.storage_controller.configure_failpoints((migration_failpoint, "off")) + origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off")) + raise + + # Ensure the destination of the migration writes an index, so that if it has corrupt state that is + # visible to the scrubber. + workload.write_rows(1, upload=False) + env.get_pageserver(dest_ps_id).http_client().timeline_checkpoint( + tenant_id, timeline_id, wait_until_uploaded=True, compact=False + ) + + # The destination of the live migration would now have a corrupt index (referencing deleted L0s) if + # the controller had not properly applied validation rules. + healthy, _summary = env.storage_scrubber.scan_metadata() + try: + log.info(f"scrubbed, healthy={healthy}") + assert healthy + except: + # On failures, we want to report them FAIL during the test, not as ERROR during teardown + neon_env_builder.enable_scrub_on_exit = False + raise + + +@run_only_on_default_postgres("this is like a 'unit test' against storcon db") +def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_configs() + env.start() + + fake_id = 5 + + target = env.storage_controller + + assert target.get_safekeeper(fake_id) is None + + body = { + "active": True, + "id": fake_id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "safekeeper-333.us-east-2.aws.neon.build", + "port": 6401, + "http_port": 7676, + "version": 5957, + "availability_zone_id": "us-east-2b", + } + + target.on_safekeeper_deploy(fake_id, body) + + inserted = target.get_safekeeper(fake_id) + assert inserted is not None + assert eq_safekeeper_records(body, inserted) + + # error out if pk is changed (unexpected) + with pytest.raises(StorageControllerApiException) as exc: + different_pk = dict(body) + different_pk["id"] = 4 + assert different_pk["id"] != body["id"] + target.on_safekeeper_deploy(fake_id, different_pk) + assert exc.value.status_code == 400 + + inserted_again = target.get_safekeeper(fake_id) + assert inserted_again is not None + assert eq_safekeeper_records(inserted, inserted_again) + + # the most common case, version goes up: + assert isinstance(body["version"], int) + body["version"] += 1 + target.on_safekeeper_deploy(fake_id, body) + inserted_now = target.get_safekeeper(fake_id) + assert inserted_now is not None + + assert eq_safekeeper_records(body, inserted_now) + + +def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: + compared = [dict(a), dict(b)] + + masked_keys = ["created_at", "updated_at"] + + for d in compared: + # keep deleting these in case we are comparing the body as it will be uploaded by real scripts + for key in masked_keys: + if key in d: + del d[key] + + return compared[0] == compared[1] diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 292a9a1010..848e214c5e 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -217,6 +217,13 @@ def test_scrubber_physical_gc_ancestors( workload.init() workload.write_rows(100) + # Issue a deletion queue flush so that the parent shard can't leave behind layers + # that will look like unexpected garbage to the scrubber + for pre_split_shard in env.storage_controller.locate(tenant_id): + env.get_pageserver(pre_split_shard["node_id"]).http_client().deletion_queue_flush( + execute=True + ) + new_shard_count = 4 assert shard_count is None or new_shard_count > shard_count shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) @@ -321,6 +328,10 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder workload.write_rows(100, upload=False) workload.stop() + # Issue a deletion queue flush so that the parent shard can't leave behind layers + # that will look like unexpected garbage to the scrubber + env.get_tenant_pageserver(tenant_id).http_client().deletion_queue_flush(execute=True) + new_shard_count = 4 shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) for shard in shards: diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 448a28dc31..7ee949e8d3 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,7 +1,9 @@ +import json from threading import Thread import pytest from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, @@ -17,6 +19,8 @@ from fixtures.pageserver.utils import ( from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.utils import run_pg_bench_small, wait_until from requests.exceptions import ReadTimeout +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response def error_tolerant_delete(ps_http, tenant_id): @@ -322,7 +326,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) env.pageserver.stop() -def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder): +def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder: NeonEnvBuilder): """ Validate that creating and then deleting the tenant both survives the scrubber, and that one can run the scrubber without problems. @@ -347,6 +351,45 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) healthy, _ = env.storage_scrubber.scan_metadata() assert healthy + timeline_lsns = { + "tenant_id": f"{tenant_id}", + "timeline_id": f"{timeline_id}", + "timeline_start_lsn": f"{last_flush_lsn}", + "backup_lsn": f"{last_flush_lsn}", + } + + cloud_admin_url = f"http://{make_httpserver.host}:{make_httpserver.port}/" + cloud_admin_token = "" + + def get_branches(request: Request): + # Compare definition with `BranchData` struct + dummy_data = { + "id": "test-branch-id", + "created_at": "", # TODO + "updated_at": "", # TODO + "name": "testbranchname", + "project_id": "test-project-id", + "timeline_id": f"{timeline_id}", + "default": False, + "deleted": False, + "logical_size": 42000, + "physical_size": 42000, + "written_size": 42000, + } + # This test does all its own compute configuration (by passing explicit pageserver ID to Workload functions), + # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute. + log.info(f"got get_branches request: {request.json}") + return Response(json.dumps(dummy_data), content_type="application/json", status=200) + + make_httpserver.expect_request("/branches", method="GET").respond_with_handler(get_branches) + + healthy, _ = env.storage_scrubber.scan_metadata_safekeeper( + timeline_lsns=[timeline_lsns], + cloud_admin_api_url=cloud_admin_url, + cloud_admin_api_token=cloud_admin_token, + ) + assert healthy + env.start() ps_http = env.pageserver.http_client() ps_http.tenant_delete(tenant_id) @@ -354,3 +397,10 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) healthy, _ = env.storage_scrubber.scan_metadata() assert healthy + + healthy, _ = env.storage_scrubber.scan_metadata_safekeeper( + timeline_lsns=[timeline_lsns], + cloud_admin_api_url=cloud_admin_url, + cloud_admin_api_token=cloud_admin_token, + ) + assert healthy diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 0ebf714de0..b63ff7f6bd 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -372,8 +372,10 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): tenant_id: TenantId = env.initial_tenant timeline_id = env.initial_timeline - # Multiple creation requests which race will generate this error + # Multiple creation requests which race will generate this error on the pageserver + # and storage controller respectively env.pageserver.allowed_errors.append(".*Conflict: Tenant is already being modified.*") + env.storage_controller.allowed_errors.append(".*Conflict: Tenant is already being modified.*") # Tenant creation requests which arrive out of order will generate complaints about # generation nubmers out of order. diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index b774c7c9fe..de43e51c9e 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -1,96 +1,113 @@ import pytest from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId from fixtures.neon_fixtures import ( - NeonEnv, + NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverApiException -def test_timeline_archive(neon_simple_env: NeonEnv): - env = neon_simple_env +@pytest.mark.parametrize("shard_count", [0, 4]) +def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): + unsharded = shard_count == 0 + if unsharded: + env = neon_env_builder.init_start() + # If we run the unsharded version, talk to the pageserver directly + ps_http = env.pageserver.http_client() + else: + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + # If we run the unsharded version, talk to the storage controller + ps_http = env.storage_controller.pageserver_api() - env.pageserver.allowed_errors.extend( - [ - ".*Timeline .* was not found.*", - ".*timeline not found.*", - ".*Cannot archive timeline which has unarchived child timelines.*", - ".*Precondition failed: Requested tenant is missing.*", - ] - ) - - ps_http = env.pageserver.http_client() - - # first try to archive non existing timeline - # for existing tenant: + # first try to archive a non existing timeline for an existing tenant: invalid_timeline_id = TimelineId.generate() with pytest.raises(PageserverApiException, match="timeline not found") as exc: ps_http.timeline_archival_config( - tenant_id=env.initial_tenant, - timeline_id=invalid_timeline_id, + env.initial_tenant, + invalid_timeline_id, state=TimelineArchivalState.ARCHIVED, ) assert exc.value.status_code == 404 - # for non existing tenant: + # for a non existing tenant: invalid_tenant_id = TenantId.generate() with pytest.raises( PageserverApiException, - match=f"NotFound: tenant {invalid_tenant_id}", + match="NotFound: [tT]enant", ) as exc: ps_http.timeline_archival_config( - tenant_id=invalid_tenant_id, - timeline_id=invalid_timeline_id, + invalid_tenant_id, + invalid_timeline_id, state=TimelineArchivalState.ARCHIVED, ) assert exc.value.status_code == 404 - # construct pair of branches to validate that pageserver prohibits + # construct a pair of branches to validate that pageserver prohibits # archival of ancestor timelines when they have non-archived child branches - parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent", "empty") + parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent") leaf_timeline_id = env.neon_cli.create_branch( "test_ancestor_branch_archive_branch1", "test_ancestor_branch_archive_parent" ) - timeline_path = env.pageserver.timeline_dir(env.initial_tenant, parent_timeline_id) - with pytest.raises( PageserverApiException, match="Cannot archive timeline which has non-archived child timelines", ) as exc: - assert timeline_path.exists() - ps_http.timeline_archival_config( - tenant_id=env.initial_tenant, - timeline_id=parent_timeline_id, + env.initial_tenant, + parent_timeline_id, state=TimelineArchivalState.ARCHIVED, ) assert exc.value.status_code == 412 - # Test timeline_detail leaf_detail = ps_http.timeline_detail( - tenant_id=env.initial_tenant, + env.initial_tenant, timeline_id=leaf_timeline_id, ) assert leaf_detail["is_archived"] is False # Test that archiving the leaf timeline and then the parent works ps_http.timeline_archival_config( - tenant_id=env.initial_tenant, - timeline_id=leaf_timeline_id, + env.initial_tenant, + leaf_timeline_id, state=TimelineArchivalState.ARCHIVED, ) leaf_detail = ps_http.timeline_detail( - tenant_id=env.initial_tenant, - timeline_id=leaf_timeline_id, + env.initial_tenant, + leaf_timeline_id, ) assert leaf_detail["is_archived"] is True ps_http.timeline_archival_config( - tenant_id=env.initial_tenant, - timeline_id=parent_timeline_id, + env.initial_tenant, + parent_timeline_id, state=TimelineArchivalState.ARCHIVED, ) + + # Test that the leaf can't be unarchived + with pytest.raises( + PageserverApiException, + match="ancestor is archived", + ) as exc: + ps_http.timeline_archival_config( + env.initial_tenant, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + # Unarchive works for the leaf if the parent gets unarchived first + ps_http.timeline_archival_config( + env.initial_tenant, + parent_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + ps_http.timeline_archival_config( + env.initial_tenant, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 642b9e449b..9bf5f8680b 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -733,7 +733,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # We will run with the limit set to 1, so that once we have one tenant stuck # in a pausable failpoint, the rest are prevented from proceeding through warmup. - neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() @@ -984,7 +984,7 @@ def test_timeline_logical_size_task_priority(neon_env_builder: NeonEnvBuilder): def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): - neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" env = neon_env_builder.init_start() @@ -1062,7 +1062,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): @pytest.mark.parametrize("activation_method", ["endpoint", "branch", "delete"]) def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_method: str): # env.initial_tenant will take up this permit when attaching with lazy because of a failpoint activated after restart - neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = '1'" + neon_env_builder.pageserver_config_override = "concurrent_tenant_warmup = 1" env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 19df834b81..3785651aed 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -72,6 +72,17 @@ def wait_lsn_force_checkpoint( wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options) +def wait_lsn_force_checkpoint_at_sk( + safekeeper: Safekeeper, + tenant_id: TenantId, + timeline_id: TimelineId, + ps: NeonPageserver, + pageserver_conn_options=None, +): + sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id) + wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options) + + def wait_lsn_force_checkpoint_at( lsn: Lsn, tenant_id: TenantId, @@ -79,6 +90,10 @@ def wait_lsn_force_checkpoint_at( ps: NeonPageserver, pageserver_conn_options=None, ): + """ + Wait until pageserver receives given lsn, force checkpoint and wait for + upload, i.e. remote_consistent_lsn advancement. + """ pageserver_conn_options = pageserver_conn_options or {} auth_token = None @@ -2330,6 +2345,77 @@ def test_s3_eviction( assert event_metrics_seen +# Test resetting uploaded partial segment state. +def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + # We want to upload/evict quickly, but not too quickly to check that s3 is + # empty before next round of upload happens. + # Note: this test fails with --delete-offloaded-wal, this is expected. + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--partial-backup-timeout", + "1s", + "--control-file-save-interval", + "1s", + "--eviction-min-resident=1s", + ] + # XXX: pageserver currently connects to safekeeper as long as connection + # manager doesn't remove its entry (default lagging_wal_timeout is 10s), + # causing uneviction. It should be fixed to not reconnect if last + # remote_consistent_lsn is communicated and there is nothing to fetch. Make + # value lower to speed up the test. + initial_tenant_conf = { + "lagging_wal_timeout": "1s", + } + env = neon_env_builder.init_start(initial_tenant_conf) + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create("main") + endpoint.start() + endpoint.safe_psql("create table t(key int, value text)") + endpoint.stop() + sk = env.safekeepers[0] + # eviction won't happen until remote_consistent_lsn catches up. + wait_lsn_force_checkpoint_at_sk(sk, tenant_id, timeline_id, env.pageserver) + + http_cli = env.safekeepers[0].http_client() + + # wait until eviction happens + def evicted(): + eviction_state = http_cli.get_eviction_state(timeline_id) + log.info(f"eviction_state: {eviction_state}") + if isinstance(eviction_state, str) and eviction_state == "Present": + raise Exception("eviction didn't happen yet") + + wait_until(30, 1, evicted) + # it must have uploaded something + uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id) + log.info(f"uploaded segments before reset: {uploaded_segs}") + assert len(uploaded_segs) > 0 + + reset_res = http_cli.backup_partial_reset(tenant_id, timeline_id) + log.info(f"reset res: {reset_res}") + + # Backup_partial_reset must have reset the state and dropped s3 segment. + # + # Note: if listing takes more than --partial-backup-timeout test becomes + # flaky because file might be reuploaded. With local fs it shouldn't be an + # issue, but can add retry if this appears. + uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id) + log.info(f"uploaded segments after reset: {uploaded_segs}") + assert len(uploaded_segs) == 0 + + # calling second time should be ok + http_cli.backup_partial_reset(tenant_id, timeline_id) + + # inserting data should be ok + endpoint.start() + endpoint.safe_psql("insert into t values(1, 'hehe')") + + def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilder): """ Verify that pulling timeline from a SK with an uploaded partial segment @@ -2357,7 +2443,16 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde "--eviction-min-resident=500ms", ] - env = neon_env_builder.init_start(initial_tenant_conf={"checkpoint_timeout": "100ms"}) + # XXX: pageserver currently connects to safekeeper as long as connection + # manager doesn't remove its entry (default lagging_wal_timeout is 10s), + # causing uneviction. It should be fixed to not reconnect if last + # remote_consistent_lsn is communicated and there is nothing to fetch. Until + # this is fixed make value lower to speed up the test. + initial_tenant_conf = { + "lagging_wal_timeout": "1s", + "checkpoint_timeout": "100ms", + } + env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -2421,7 +2516,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde endpoint.start(safekeepers=[2, 3]) def new_partial_segment_uploaded(): - segs = src_sk.list_uploaded_segments(tenant_id, timeline_id) + segs = dst_sk.list_uploaded_segments(tenant_id, timeline_id) for seg in segs: if "partial" in seg and "sk3" in seg: return seg diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 6582b34218..229d3efd8e 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -62,6 +62,12 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil elements_to_insert = 1_000_000 expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive" env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*") + # we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout + # => after we run into a timeout and reconnect to a different SK, more time than wait_lsn_timeout has passed + # ==> we log this error + env.pageserver.allowed_errors.append( + ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" + ) insert_test_elements(env, tenant_id, start=0, count=elements_to_insert) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index b6910406e2..a317b9b5b9 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit b6910406e2d05a2c94baa2e530ec882733047759 +Subproject commit a317b9b5b96978b49e78986697f3dd80d06f99a7 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 76063bff63..49d5e576a5 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 76063bff638ccce7afa99fc9037ac51338b9823d +Subproject commit 49d5e576a56e4cc59cd6a6a0791b2324b9fa675e diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 8efa089aa7..6e9a4ff624 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 8efa089aa7786381543a4f9efc69b92d43eab8c0 +Subproject commit 6e9a4ff6249ac02b8175054b7b3f7dfb198be48b diff --git a/vendor/revisions.json b/vendor/revisions.json index 50cc99c2f1..e52576e61f 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,14 +1,14 @@ { "v16": [ "16.4", - "8efa089aa7786381543a4f9efc69b92d43eab8c0" + "6e9a4ff6249ac02b8175054b7b3f7dfb198be48b" ], "v15": [ "15.8", - "76063bff638ccce7afa99fc9037ac51338b9823d" + "49d5e576a56e4cc59cd6a6a0791b2324b9fa675e" ], "v14": [ "14.13", - "b6910406e2d05a2c94baa2e530ec882733047759" + "a317b9b5b96978b49e78986697f3dd80d06f99a7" ] }