diff --git a/.github/actions/neon-project-create/action.yml b/.github/actions/neon-project-create/action.yml index a5b4104908..d7ff05be1a 100644 --- a/.github/actions/neon-project-create/action.yml +++ b/.github/actions/neon-project-create/action.yml @@ -49,10 +49,6 @@ inputs: description: 'A JSON object with project settings' required: false default: '{}' - default_endpoint_settings: - description: 'A JSON object with the default endpoint settings' - required: false - default: '{}' outputs: dsn: @@ -139,21 +135,6 @@ runs: -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ -d "{\"scheduling\": \"Essential\"}" fi - # XXX - # This is a workaround for the default endpoint settings, which currently do not allow some settings in the public API. - # https://github.com/neondatabase/cloud/issues/27108 - if [[ -n ${DEFAULT_ENDPOINT_SETTINGS} && ${DEFAULT_ENDPOINT_SETTINGS} != "{}" ]] ; then - PROJECT_DATA=$(curl -X GET \ - "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}" \ - -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ - -d "{\"scheduling\": \"Essential\"}" - ) - NEW_DEFAULT_ENDPOINT_SETTINGS=$(echo ${PROJECT_DATA} | jq -rc ".project.default_endpoint_settings + ${DEFAULT_ENDPOINT_SETTINGS}") - curl -X POST --fail \ - "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/projects/${project_id}/default_endpoint_settings" \ - -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \ - --data "${NEW_DEFAULT_ENDPOINT_SETTINGS}" - fi env: @@ -171,4 +152,3 @@ runs: PSQL: ${{ inputs.psql_path }} LD_LIBRARY_PATH: ${{ inputs.libpq_lib_path }} PROJECT_SETTINGS: ${{ inputs.project_settings }} - DEFAULT_ENDPOINT_SETTINGS: ${{ inputs.default_endpoint_settings }} diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 
7cede309f3..663afa2c8b 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -279,18 +279,14 @@ jobs: # run all non-pageserver tests ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E '!package(pageserver)' - # run pageserver tests with different settings - for get_vectored_concurrent_io in sequential sidecar-task; do - for io_engine in std-fs tokio-epoll-uring ; do - for io_mode in buffered direct direct-rw ; do - NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \ - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \ - NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE=$io_mode \ - ${cov_prefix} \ - cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' - done - done - done + # run pageserver tests + # (When developing new pageserver features gated by config fields, we commonly make the rust + # unit tests sensitive to an environment variable NEON_PAGESERVER_UNIT_TEST_FEATURENAME. + # Then run the nextest invocation below for all relevant combinations. Singling out the + # pageserver tests from non-pageserver tests cuts down the time it takes for this CI step.) 
+ NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=tokio-epoll-uring \ + ${cov_prefix} \ + cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)' # Run separate tests for real S3 export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty @@ -405,8 +401,6 @@ jobs: CHECK_ONDISK_DATA_COMPATIBILITY: nonempty BUILD_TAG: ${{ inputs.build-tag }} PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task - PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }} # Temporary disable this step until we figure out why it's so flaky diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e0995218f9..a887db2ab1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -323,8 +323,6 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task - PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw SYNC_BETWEEN_TESTS: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones @@ -965,7 +963,7 @@ jobs: fi - name: Verify docker-compose example and test extensions - timeout-minutes: 20 + timeout-minutes: 60 env: TAG: >- ${{ diff --git a/.github/workflows/cloud-extensions.yml b/.github/workflows/cloud-extensions.yml index 4114f0f9b4..25fe0877d9 100644 --- a/.github/workflows/cloud-extensions.yml +++ b/.github/workflows/cloud-extensions.yml @@ -35,7 +35,7 @@ jobs: matrix: pg-version: [16, 17] - runs-on: [ self-hosted, small ] + runs-on: us-east-2 container: # We use the neon-test-extensions image here as it contains the source code for the extensions. 
image: ghcr.io/neondatabase/neon-test-extensions-v${{ matrix.pg-version }}:latest @@ -71,20 +71,7 @@ jobs: region_id: ${{ inputs.region_id || 'aws-us-east-2' }} postgres_version: ${{ matrix.pg-version }} project_settings: ${{ steps.project-settings.outputs.settings }} - # We need these settings to get the expected output results. - # We cannot use the environment variables e.g. PGTZ due to - # https://github.com/neondatabase/neon/issues/1287 - default_endpoint_settings: > - { - "pg_settings": { - "DateStyle": "Postgres,MDY", - "TimeZone": "America/Los_Angeles", - "compute_query_id": "off", - "neon.allow_unstable_extensions": "on" - } - } api_key: ${{ secrets.NEON_STAGING_API_KEY }} - admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} - name: Run the regression tests run: /run-tests.sh -r /ext-src diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 9c504eb5bf..3427a0eb49 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -63,8 +63,10 @@ jobs: - name: Filter out only v-string for build matrix id: postgres_changes + env: + CHANGES: ${{ steps.files_changed.outputs.changes }} run: | - v_strings_only_as_json_array=$(echo ${{ steps.files_changed.outputs.chnages }} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c) + v_strings_only_as_json_array=$(echo ${CHANGES} | jq '.[]|select(test("v\\d+"))' | jq --slurp -c) echo "changes=${v_strings_only_as_json_array}" | tee -a "${GITHUB_OUTPUT}" check-macos-build: diff --git a/Cargo.lock b/Cargo.lock index fe4cc35029..d919537818 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1112,6 +1112,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "cgroups-rs" version = "0.3.3" @@ -1303,9 +1309,10 @@ dependencies = [ "futures", "http 1.1.0", "indexmap 2.0.1", + "itertools 0.10.5", "jsonwebtoken", "metrics", - "nix 0.27.1", + "nix 0.30.1", "notify", "num_cpus", "once_cell", @@ -1428,7 +1435,7 @@ dependencies = [ "humantime-serde", "hyper 0.14.30", "jsonwebtoken", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "pageserver_api", "pageserver_client", @@ -3511,9 +3518,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.169" +version = "0.2.172" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" [[package]] name = "libloading" @@ -3787,6 +3794,16 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "neon-shmem" +version = "0.1.0" +dependencies = [ + "nix 0.30.1", + "tempfile", + "thiserror 1.0.69", + "workspace_hack", +] + [[package]] name = "never-say-never" version = "6.6.666" @@ -3820,12 +3837,13 @@ dependencies = [ [[package]] name = "nix" -version = "0.27.1" +version = "0.30.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" dependencies = [ "bitflags 2.8.0", "cfg-if", + "cfg_aliases", "libc", "memoffset 0.9.0", ] @@ -4279,7 +4297,7 @@ dependencies = [ "jsonwebtoken", "md5", "metrics", - "nix 0.27.1", + "nix 0.30.1", "num-traits", "num_cpus", "once_cell", @@ -4330,6 +4348,7 @@ dependencies = [ "toml_edit", "tracing", "tracing-utils", + "twox-hash", "url", "utils", "uuid", @@ -4354,7 +4373,7 @@ dependencies = [ "humantime", "humantime-serde", 
"itertools 0.10.5", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "postgres_backend", "postgres_ffi", @@ -4415,6 +4434,16 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "pageserver_page_api" +version = "0.1.0" +dependencies = [ + "prost 0.13.3", + "tonic", + "tonic-build", + "workspace_hack", +] + [[package]] name = "papaya" version = "0.2.1" @@ -4847,6 +4876,19 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "posthog_client_lite" +version = "0.1.0" +dependencies = [ + "anyhow", + "reqwest", + "serde", + "serde_json", + "sha2", + "thiserror 1.0.69", + "workspace_hack", +] + [[package]] name = "powerfmt" version = "0.2.0" @@ -7884,7 +7926,7 @@ dependencies = [ "humantime", "jsonwebtoken", "metrics", - "nix 0.27.1", + "nix 0.30.1", "once_cell", "pem", "pin-project-lite", @@ -8438,8 +8480,10 @@ dependencies = [ "fail", "form_urlencoded", "futures-channel", + "futures-core", "futures-executor", "futures-io", + "futures-task", "futures-util", "generic-array", "getrandom 0.2.11", @@ -8458,6 +8502,7 @@ dependencies = [ "log", "memchr", "nix 0.26.4", + "nix 0.30.1", "nom", "num", "num-bigint", @@ -8469,6 +8514,7 @@ dependencies = [ "once_cell", "p256 0.13.2", "parquet", + "percent-encoding", "prettyplease", "proc-macro2", "prost 0.13.3", diff --git a/Cargo.toml b/Cargo.toml index 8d4cc4a75a..a280c446b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,7 @@ members = [ "pageserver/ctl", "pageserver/client", "pageserver/pagebench", + "pageserver/page_api", "proxy", "safekeeper", "safekeeper/client", @@ -23,9 +24,11 @@ members = [ "libs/postgres_ffi", "libs/safekeeper_api", "libs/desim", + "libs/neon-shmem", "libs/utils", "libs/consumption_metrics", "libs/postgres_backend", + "libs/posthog_client_lite", "libs/pq_proto", "libs/tenant_size_model", "libs/metrics", @@ -126,7 +129,7 @@ md5 = "0.7.0" measured = { version = "0.0.22", features=["lasso"] } measured-process = { version = "0.0.22" } memoffset = "0.9" -nix = { version = "0.27", features = 
["dir", "fs", "process", "socket", "signal", "poll"] } +nix = { version = "0.30.1", features = ["dir", "fs", "mman", "process", "socket", "signal", "poll"] } # Do not update to >= 7.0.0, at least. The update will have a significant impact # on compute startup metrics (start_postgres_ms), >= 25% degradation. notify = "6.0.0" @@ -250,6 +253,7 @@ pageserver = { path = "./pageserver" } pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" } pageserver_client = { path = "./pageserver/client" } pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" } +pageserver_page_api = { path = "./pageserver/page_api" } postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" } postgres_connection = { version = "0.1", path = "./libs/postgres_connection/" } postgres_ffi = { version = "0.1", path = "./libs/postgres_ffi/" } diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index f63d844afd..1933fd19d8 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -292,7 +292,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.86.0 +ENV RUSTC_VERSION=1.87.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6233eaf709..17e50697db 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1117,8 +1117,8 @@ RUN wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar. mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . 
&& \ echo "#nothing to test here" > neon-test.sh -RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.1.tar.gz -O pgrag.tar.gz && \ - echo "087b2ecd11ba307dc968042ef2e9e43dc04d9ba60e8306e882c407bbe1350a50 pgrag.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.1.2.tar.gz -O pgrag.tar.gz && \ + echo "7361654ea24f08cbb9db13c2ee1c0fe008f6114076401bb871619690dafc5225 pgrag.tar.gz" | sha256sum --check && \ mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . FROM rust-extensions-build-pgrx14 AS pgrag-build @@ -1971,7 +1971,8 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml # Make the libraries we built available -RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig +COPY --chmod=0666 compute/etc/ld.so.conf.d/00-neon.conf /etc/ld.so.conf.d/00-neon.conf +RUN /sbin/ldconfig # rsyslog config permissions # directory for rsyslogd pid file diff --git a/compute/etc/ld.so.conf.d/00-neon.conf b/compute/etc/ld.so.conf.d/00-neon.conf new file mode 100644 index 0000000000..e8e4bdcd42 --- /dev/null +++ b/compute/etc/ld.so.conf.d/00-neon.conf @@ -0,0 +1 @@ +/usr/local/lib diff --git a/compute/patches/rum.patch b/compute/patches/rum.patch index b45afe2874..aed1badc13 100644 --- a/compute/patches/rum.patch +++ b/compute/patches/rum.patch @@ -7,7 +7,7 @@ index 255e616..1c6edb7 100644 RelationGetRelationName(index)); +#ifdef NEON_SMGR -+ smgr_start_unlogged_build(index->rd_smgr); ++ smgr_start_unlogged_build(RelationGetSmgr(index)); +#endif + initRumState(&buildstate.rumstate, index); @@ -18,7 +18,7 @@ index 255e616..1c6edb7 100644 rumUpdateStats(index, &buildstate.buildStats, buildstate.rumstate.isBuild); +#ifdef NEON_SMGR -+ smgr_finish_unlogged_build_phase_1(index->rd_smgr); ++ 
smgr_finish_unlogged_build_phase_1(RelationGetSmgr(index)); +#endif + /* @@ -29,7 +29,7 @@ index 255e616..1c6edb7 100644 } +#ifdef NEON_SMGR -+ smgr_end_unlogged_build(index->rd_smgr); ++ smgr_end_unlogged_build(RelationGetSmgr(index)); +#endif + /* diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 8ee5dd0665..f9da3ba700 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -28,6 +28,7 @@ flate2.workspace = true futures.workspace = true http.workspace = true indexmap.workspace = true +itertools.workspace = true jsonwebtoken.workspace = true metrics.workspace = true nix.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e337ee7b15..20b5e567a8 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -60,12 +60,16 @@ use utils::failpoint_support; // Compatibility hack: if the control plane specified any remote-ext-config // use the default value for extension storage proxy gateway. // Remove this once the control plane is updated to pass the gateway URL -fn parse_remote_ext_config(arg: &str) -> Result { - if arg.starts_with("http") { - Ok(arg.trim_end_matches('/').to_string()) +fn parse_remote_ext_base_url(arg: &str) -> Result { + const FALLBACK_PG_EXT_GATEWAY_BASE_URL: &str = + "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local"; + + Ok(if arg.starts_with("http") { + arg } else { - Ok("http://pg-ext-s3-gateway".to_string()) + FALLBACK_PG_EXT_GATEWAY_BASE_URL } + .to_owned()) } #[derive(Parser)] @@ -74,8 +78,10 @@ struct Cli { #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")] pub pgbin: String, - #[arg(short = 'r', long, value_parser = parse_remote_ext_config)] - pub remote_ext_config: Option, + /// The base URL for the remote extension storage proxy gateway. + /// Should be in the form of `http(s)://[:]`. 
+ #[arg(short = 'r', long, value_parser = parse_remote_ext_base_url, alias = "remote-ext-config")] + pub remote_ext_base_url: Option, /// The port to bind the external listening HTTP server to. Clients running /// outside the compute will talk to the compute through this port. Keep @@ -164,7 +170,7 @@ fn main() -> Result<()> { pgversion: get_pg_version_string(&cli.pgbin), external_http_port: cli.external_http_port, internal_http_port: cli.internal_http_port, - ext_remote_storage: cli.remote_ext_config.clone(), + remote_ext_base_url: cli.remote_ext_base_url.clone(), resize_swap_on_bind: cli.resize_swap_on_bind, set_disk_quota_for_fs: cli.set_disk_quota_for_fs, #[cfg(target_os = "linux")] @@ -265,4 +271,18 @@ mod test { fn verify_cli() { Cli::command().debug_assert() } + + #[test] + fn parse_pg_ext_gateway_base_url() { + let arg = "http://pg-ext-s3-gateway2"; + let result = super::parse_remote_ext_base_url(arg).unwrap(); + assert_eq!(result, arg); + + let arg = "pg-ext-s3-gateway"; + let result = super::parse_remote_ext_base_url(arg).unwrap(); + assert_eq!( + result, + "http://pg-ext-s3-gateway.pg-ext-s3-gateway.svc.cluster.local" + ); + } } diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 537028cde1..78acd78585 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -348,6 +348,7 @@ async fn run_dump_restore( "--no-security-labels".to_string(), "--no-subscriptions".to_string(), "--no-tablespaces".to_string(), + "--no-event-triggers".to_string(), // format "--format".to_string(), "directory".to_string(), diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0cda36a6e2..f494e2444a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -11,6 +11,7 @@ use compute_api::spec::{ use futures::StreamExt; use futures::future::join_all; use futures::stream::FuturesUnordered; +use itertools::Itertools; use nix::sys::signal::{Signal, kill}; use 
nix::unistd::Pid; use once_cell::sync::Lazy; @@ -18,7 +19,7 @@ use postgres; use postgres::NoTls; use postgres::error::SqlState; use remote_storage::{DownloadError, RemotePath}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::net::SocketAddr; use std::os::unix::fs::{PermissionsExt, symlink}; use std::path::Path; @@ -95,7 +96,7 @@ pub struct ComputeNodeParams { pub internal_http_port: u16, /// the address of extension storage proxy gateway - pub ext_remote_storage: Option, + pub remote_ext_base_url: Option, } /// Compute node info shared across several `compute_ctl` threads. @@ -1896,9 +1897,9 @@ LIMIT 100", real_ext_name: String, ext_path: RemotePath, ) -> Result { - let ext_remote_storage = + let remote_ext_base_url = self.params - .ext_remote_storage + .remote_ext_base_url .as_ref() .ok_or(DownloadError::BadInput(anyhow::anyhow!( "Remote extensions storage is not configured", @@ -1960,7 +1961,7 @@ LIMIT 100", let download_size = extension_server::download_extension( &real_ext_name, &ext_path, - ext_remote_storage, + remote_ext_base_url, &self.params.pgbin, ) .await @@ -1995,23 +1996,40 @@ LIMIT 100", tokio::spawn(conn); // TODO: support other types of grants apart from schemas? - let query = format!( - "GRANT {} ON SCHEMA {} TO {}", - privileges - .iter() - // should not be quoted as it's part of the command. - // is already sanitized so it's ok - .map(|p| p.as_str()) - .collect::>() - .join(", "), - // quote the schema and role name as identifiers to sanitize them. - schema_name.pg_quote(), - role_name.pg_quote(), - ); - db_client - .simple_query(&query) + + // check the role grants first - to gracefully handle read-replicas. 
+ let select = "SELECT privilege_type + FROM pg_namespace + JOIN LATERAL (SELECT * FROM aclexplode(nspacl) AS x) acl ON true + JOIN pg_user users ON acl.grantee = users.usesysid + WHERE users.usename = $1 + AND nspname = $2"; + let rows = db_client + .query(select, &[role_name, schema_name]) .await - .with_context(|| format!("Failed to execute query: {}", query))?; + .with_context(|| format!("Failed to execute query: {select}"))?; + + let already_granted: HashSet = rows.into_iter().map(|row| row.get(0)).collect(); + + let grants = privileges + .iter() + .filter(|p| !already_granted.contains(p.as_str())) + // should not be quoted as it's part of the command. + // is already sanitized so it's ok + .map(|p| p.as_str()) + .join(", "); + + if !grants.is_empty() { + // quote the schema and role name as identifiers to sanitize them. + let schema_name = schema_name.pg_quote(); + let role_name = role_name.pg_quote(); + + let query = format!("GRANT {grants} ON SCHEMA {schema_name} TO {role_name}",); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + } Ok(()) } @@ -2069,7 +2087,7 @@ LIMIT 100", &self, spec: &ComputeSpec, ) -> Result { - if self.params.ext_remote_storage.is_none() { + if self.params.remote_ext_base_url.is_none() { return Ok(RemoteExtensionMetrics { num_ext_downloaded: 0, largest_ext_size: 0, diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 42d245f55a..933b30134f 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -224,7 +224,10 @@ pub fn write_postgres_conf( writeln!(file, "pgaudit.log_rotation_age=5")?; // Enable audit logs for pg_session_jwt extension - writeln!(file, "pg_session_jwt.audit_log=on")?; + // TODO: Consider a good approach for shipping pg_session_jwt logs to the same sink as + // pgAudit - additional context in https://github.com/neondatabase/cloud/issues/28863 + // + // writeln!(file, "pg_session_jwt.audit_log=on")?; // Add 
audit shared_preload_libraries, if they are not present. // diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index ee889e0c40..3439383699 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -158,14 +158,14 @@ fn parse_pg_version(human_version: &str) -> PostgresMajorVersion { pub async fn download_extension( ext_name: &str, ext_path: &RemotePath, - ext_remote_storage: &str, + remote_ext_base_url: &str, pgbin: &str, ) -> Result { info!("Download extension {:?} from {:?}", ext_name, ext_path); // TODO add retry logic let download_buffer = - match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await { + match download_extension_tar(remote_ext_base_url, &ext_path.to_string()).await { Ok(buffer) => buffer, Err(error_message) => { return Err(anyhow::anyhow!( @@ -272,8 +272,8 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) { // Do request to extension storage proxy, e.g., // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst // using HTTP GET and return the response body as bytes. 
-async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result { - let uri = format!("{}/{}", ext_remote_storage, ext_path); +async fn download_extension_tar(remote_ext_base_url: &str, ext_path: &str) -> Result { + let uri = format!("{}/{}", remote_ext_base_url, ext_path); let filename = Path::new(ext_path) .file_name() .unwrap_or_else(|| std::ffi::OsStr::new("unknown")) diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index 6508de6eee..e141a48b7f 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -22,7 +22,7 @@ pub(in crate::http) async fn download_extension( State(compute): State>, ) -> Response { // Don't even try to download extensions if no remote storage is configured - if compute.params.ext_remote_storage.is_none() { + if compute.params.remote_ext_base_url.is_none() { return JsonResponse::error( StatusCode::PRECONDITION_FAILED, "remote storage is not configured", diff --git a/control_plane/src/background_process.rs b/control_plane/src/background_process.rs index 1eac4f7ff0..4f0934e411 100644 --- a/control_plane/src/background_process.rs +++ b/control_plane/src/background_process.rs @@ -14,7 +14,7 @@ use std::ffi::OsStr; use std::io::Write; -use std::os::unix::prelude::AsRawFd; +use std::os::fd::AsFd; use std::os::unix::process::CommandExt; use std::path::Path; use std::process::Command; @@ -356,7 +356,7 @@ where let file = pid_file::claim_for_current_process(&path).expect("claim pid file"); // Remove the FD_CLOEXEC flag on the pidfile descriptor so that the pidfile // remains locked after exec. - nix::fcntl::fcntl(file.as_raw_fd(), FcntlArg::F_SETFD(FdFlag::empty())) + nix::fcntl::fcntl(file.as_fd(), FcntlArg::F_SETFD(FdFlag::empty())) .expect("remove FD_CLOEXEC"); // Don't run drop(file), it would close the file before we actually exec. 
std::mem::forget(file); diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index fd625e9ed6..98ab6e5657 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -8,7 +8,6 @@ use std::borrow::Cow; use std::collections::{BTreeSet, HashMap}; use std::fs::File; -use std::os::fd::AsRawFd; use std::path::PathBuf; use std::process::exit; use std::str::FromStr; @@ -31,7 +30,7 @@ use control_plane::safekeeper::SafekeeperNode; use control_plane::storage_controller::{ NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController, }; -use nix::fcntl::{FlockArg, flock}; +use nix::fcntl::{Flock, FlockArg}; use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, @@ -644,9 +643,10 @@ struct EndpointStartCmdArgs { #[clap( long, - help = "Configure the remote extensions storage proxy gateway to request for extensions." + help = "Configure the remote extensions storage proxy gateway URL to request for extensions.", + alias = "remote-ext-config" )] - remote_ext_config: Option, + remote_ext_base_url: Option, #[clap( long, @@ -748,16 +748,16 @@ struct TimelineTreeEl { /// A flock-based guard over the neon_local repository directory struct RepoLock { - _file: File, + _file: Flock, } impl RepoLock { fn new() -> Result { let repo_dir = File::open(local_env::base_path())?; - let repo_dir_fd = repo_dir.as_raw_fd(); - flock(repo_dir_fd, FlockArg::LockExclusive)?; - - Ok(Self { _file: repo_dir }) + match Flock::lock(repo_dir, FlockArg::LockExclusive) { + Ok(f) => Ok(Self { _file: f }), + Err((_, e)) => Err(e).context("flock error"), + } } } @@ -1414,9 +1414,16 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res EndpointCmd::Start(args) => { let endpoint_id = &args.endpoint_id; let pageserver_id = args.endpoint_pageserver_id; - let remote_ext_config = &args.remote_ext_config; + let 
remote_ext_base_url = &args.remote_ext_base_url; - let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new); + let default_generation = env + .storage_controller + .timelines_onto_safekeepers + .then_some(1); + let safekeepers_generation = args + .safekeepers_generation + .or(default_generation) + .map(SafekeeperGeneration::new); // If --safekeepers argument is given, use only the listed // safekeeper nodes; otherwise all from the env. let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? { @@ -1510,7 +1517,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res safekeepers_generation, safekeepers, pageservers, - remote_ext_config.as_ref(), + remote_ext_base_url.as_ref(), stripe_size.0 as usize, args.create_test_user, args.start_timeout, diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index be73661a3c..708745446d 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -655,7 +655,7 @@ impl Endpoint { safekeepers_generation: Option, safekeepers: Vec, pageservers: Vec<(Host, u16)>, - remote_ext_config: Option<&String>, + remote_ext_base_url: Option<&String>, shard_stripe_size: usize, create_test_user: bool, start_timeout: Duration, @@ -825,8 +825,8 @@ impl Endpoint { .stderr(logfile.try_clone()?) 
.stdout(logfile); - if let Some(remote_ext_config) = remote_ext_config { - cmd.args(["--remote-ext-config", remote_ext_config]); + if let Some(remote_ext_base_url) = remote_ext_base_url { + cmd.args(["--remote-ext-base-url", remote_ext_base_url]); } let child = cmd.spawn()?; diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index a36815d27e..755d67a7ad 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -10,7 +10,8 @@ use camino::{Utf8Path, Utf8PathBuf}; use hyper0::Uri; use nix::unistd::Pid; use pageserver_api::controller_api::{ - NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, TenantCreateRequest, + NodeConfigureRequest, NodeDescribeResponse, NodeRegisterRequest, + SafekeeperSchedulingPolicyRequest, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantLocateResponse, }; use pageserver_api::models::{ @@ -20,7 +21,7 @@ use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api::ResponseErrorMessageExt; use pem::Pem; use postgres_backend::AuthType; -use reqwest::Method; +use reqwest::{Method, Response}; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use tokio::process::Command; @@ -570,6 +571,11 @@ impl StorageController { let peer_jwt_token = encode_from_key_file(&peer_claims, private_key) .expect("failed to generate jwt token"); args.push(format!("--peer-jwt-token={peer_jwt_token}")); + + let claims = Claims::new(None, Scope::SafekeeperData); + let jwt_token = + encode_from_key_file(&claims, private_key).expect("failed to generate jwt token"); + args.push(format!("--safekeeper-jwt-token={jwt_token}")); } if let Some(public_key) = &self.public_key { @@ -614,6 +620,10 @@ impl StorageController { self.env.base_data_dir.display() )); + if self.env.safekeepers.iter().any(|sk| sk.auth_enabled) && self.private_key.is_none() { + anyhow::bail!("Safekeeper set up for auth but no private key specified"); + } + 
if self.config.timelines_onto_safekeepers { args.push("--timelines-onto-safekeepers".to_string()); } @@ -640,6 +650,10 @@ impl StorageController { ) .await?; + if self.config.timelines_onto_safekeepers { + self.register_safekeepers().await?; + } + Ok(()) } @@ -743,6 +757,23 @@ impl StorageController { where RQ: Serialize + Sized, RS: DeserializeOwned + Sized, + { + let response = self.dispatch_inner(method, path, body).await?; + Ok(response + .json() + .await + .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) + } + + /// Simple HTTP request wrapper for calling into storage controller + async fn dispatch_inner( + &self, + method: reqwest::Method, + path: String, + body: Option, + ) -> anyhow::Result + where + RQ: Serialize + Sized, { // In the special case of the `storage_controller start` subcommand, we wish // to use the API endpoint of the newly started storage controller in order @@ -785,10 +816,31 @@ impl StorageController { let response = builder.send().await?; let response = response.error_from_body().await?; - Ok(response - .json() - .await - .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?) 
+ Ok(response) + } + + /// Register the safekeepers in the storage controller + #[instrument(skip(self))] + async fn register_safekeepers(&self) -> anyhow::Result<()> { + for sk in self.env.safekeepers.iter() { + let sk_id = sk.id; + let body = serde_json::json!({ + "id": sk_id, + "created_at": "2023-10-25T09:11:25Z", + "updated_at": "2024-08-28T11:32:43Z", + "region_id": "aws-us-east-2", + "host": "127.0.0.1", + "port": sk.pg_port, + "http_port": sk.http_port, + "https_port": sk.https_port, + "version": 5957, + "availability_zone_id": format!("us-east-2b-{sk_id}"), + }); + self.upsert_safekeeper(sk_id, body).await?; + self.safekeeper_scheduling_policy(sk_id, SkSchedulingPolicy::Active) + .await?; + } + Ok(()) } /// Call into the attach_hook API, for use before handing out attachments to pageservers @@ -816,6 +868,42 @@ impl StorageController { Ok(response.generation) } + #[instrument(skip(self))] + pub async fn upsert_safekeeper( + &self, + node_id: NodeId, + request: serde_json::Value, + ) -> anyhow::Result<()> { + let resp = self + .dispatch_inner::( + Method::POST, + format!("control/v1/safekeeper/{node_id}"), + Some(request), + ) + .await?; + if !resp.status().is_success() { + anyhow::bail!( + "setting scheduling policy unsuccessful for safekeeper {node_id}: {}", + resp.status() + ); + } + Ok(()) + } + + #[instrument(skip(self))] + pub async fn safekeeper_scheduling_policy( + &self, + node_id: NodeId, + scheduling_policy: SkSchedulingPolicy, + ) -> anyhow::Result<()> { + self.dispatch::( + Method::POST, + format!("control/v1/safekeeper/{node_id}/scheduling_policy"), + Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }), + ) + .await + } + #[instrument(skip(self))] pub async fn inspect( &self, diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh index 723b2f8afb..20a1ffb7a0 100755 --- a/docker-compose/compute_wrapper/shell/compute.sh +++ b/docker-compose/compute_wrapper/shell/compute.sh @@ 
-14,6 +14,14 @@ PG_VERSION=${PG_VERSION:-14} CONFIG_FILE_ORG=/var/db/postgres/configs/config.json CONFIG_FILE=/tmp/config.json +# Test that the first library path that the dynamic loader looks in is the path +# that we use for custom compiled software +first_path="$(ldconfig --verbose 2>/dev/null \ + | grep --invert-match ^$'\t' \ + | cut --delimiter=: --fields=1 \ + | head --lines=1)" +test "$first_path" == '/usr/local/lib' || true # Remove the || true in a follow-up PR. Needed for backwards compat. + echo "Waiting pageserver become ready." while ! nc -z pageserver 6400; do sleep 1; diff --git a/docker-compose/ext-src/alter_db.sh b/docker-compose/ext-src/alter_db.sh new file mode 100755 index 0000000000..6df37e1c9b --- /dev/null +++ b/docker-compose/ext-src/alter_db.sh @@ -0,0 +1,8 @@ +#!/bin/bash +# We need these settings to get the expected output results. +# We cannot use the environment variables e.g. PGTZ due to +# https://github.com/neondatabase/neon/issues/1287 +export DATABASE=${1:-contrib_regression} +psql -c "ALTER DATABASE ${DATABASE} SET neon.allow_unstable_extensions='on'" \ + -c "ALTER DATABASE ${DATABASE} SET DateStyle='Postgres,MDY'" \ + -c "ALTER DATABASE ${DATABASE} SET TimeZone='America/Los_Angeles'" \ diff --git a/docker-compose/ext-src/pg_graphql-src/regular-test.sh b/docker-compose/ext-src/pg_graphql-src/regular-test.sh index 85e1ae057a..9e7d63b612 100755 --- a/docker-compose/ext-src/pg_graphql-src/regular-test.sh +++ b/docker-compose/ext-src/pg_graphql-src/regular-test.sh @@ -18,6 +18,7 @@ TESTS=${TESTS/row_level_security/} TESTS=${TESTS/sqli_connection/} dropdb --if-exist contrib_regression createdb contrib_regression +. 
../alter_db.sh psql -v ON_ERROR_STOP=1 -f test/fixtures.sql -d contrib_regression ${REGRESS} --use-existing --dbname=contrib_regression --inputdir=${TESTDIR} ${TESTS} diff --git a/docker-compose/ext-src/pgrag-src/regular-test.sh b/docker-compose/ext-src/pgrag-src/regular-test.sh index 6cb1b049a4..22eb7498fd 100755 --- a/docker-compose/ext-src/pgrag-src/regular-test.sh +++ b/docker-compose/ext-src/pgrag-src/regular-test.sh @@ -3,6 +3,7 @@ set -ex cd "$(dirname "${0}")" dropdb --if-exist contrib_regression createdb contrib_regression +. ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag" PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --load-extension=vector --load-extension=rag --dbname=contrib_regression basic_functions text_processing api_keys chunking_functions document_processing embedding_api_functions voyageai_functions diff --git a/docker-compose/ext-src/pgx_ulid-src/Makefile b/docker-compose/ext-src/pgx_ulid-src/Makefile index 6480c48441..00975e8c48 100644 --- a/docker-compose/ext-src/pgx_ulid-src/Makefile +++ b/docker-compose/ext-src/pgx_ulid-src/Makefile @@ -20,5 +20,6 @@ installcheck: regression-test regression-test: dropdb --if-exists contrib_regression createdb contrib_regression + ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION $(EXTNAME)" $(PG_REGRESS) --inputdir=. --outputdir=. --use-existing --dbname=contrib_regression $(REGRESS) diff --git a/docker-compose/ext-src/plv8-src/regular-test.sh b/docker-compose/ext-src/plv8-src/regular-test.sh index b10cc65e8a..d5224e341c 100755 --- a/docker-compose/ext-src/plv8-src/regular-test.sh +++ b/docker-compose/ext-src/plv8-src/regular-test.sh @@ -3,6 +3,7 @@ set -ex cd "$(dirname ${0})" dropdb --if-exist contrib_regression createdb contrib_regression +. 
../alter_db.sh PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress REGRESS="$(make -n installcheck | awk '{print substr($0,index($0,"init-extension"));}')" REGRESS="${REGRESS/startup_perms/}" diff --git a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile index ac87cc511b..de6bdd06c0 100644 --- a/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile +++ b/docker-compose/ext-src/rag_bge_small_en_v15-src/Makefile @@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress installcheck: dropdb --if-exists contrib_regression createdb contrib_regression + ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_bge_small_en_v15" $(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS) \ No newline at end of file diff --git a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile index e81f94ef47..7adcad32f7 100644 --- a/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile +++ b/docker-compose/ext-src/rag_jina_reranker_v1_tiny_en-src/Makefile @@ -11,5 +11,6 @@ PG_REGRESS := $(dir $(PGXS))../../src/test/regress/pg_regress installcheck: dropdb --if-exists contrib_regression createdb contrib_regression + ../alter_db.sh psql -d contrib_regression -c "CREATE EXTENSION vector" -c "CREATE EXTENSION rag_jina_reranker_v1_tiny_en" $(PG_REGRESS) --use-existing --dbname=contrib_regression $(REGRESS) diff --git a/docker-compose/ext-src/rum-src/regular-test.sh b/docker-compose/ext-src/rum-src/regular-test.sh index d1d45a36ef..815c1adb53 100755 --- a/docker-compose/ext-src/rum-src/regular-test.sh +++ b/docker-compose/ext-src/rum-src/regular-test.sh @@ -3,5 +3,6 @@ set -ex cd "$(dirname ${0})" dropdb --if-exist contrib_regression createdb contrib_regression +. 
../alter_db.sh PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress ${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin' --use-existing --dbname=contrib_regression rum rum_hash ruminv timestamp orderby orderby_hash altorder altorder_hash limits int2 int4 int8 float4 float8 money oid time timetz date interval macaddr inet cidr text varchar char bytea bit varbit numeric rum_weight expr array \ No newline at end of file diff --git a/docker-compose/pageserver_config/pageserver.toml b/docker-compose/pageserver_config/pageserver.toml index 7d603b6c65..81445ed412 100644 --- a/docker-compose/pageserver_config/pageserver.toml +++ b/docker-compose/pageserver_config/pageserver.toml @@ -5,3 +5,4 @@ listen_http_addr='0.0.0.0:9898' remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' } control_plane_api='http://0.0.0.0:6666' # No storage controller in docker compose, specify a junk address control_plane_emergency_mode=true +virtual_file_io_mode="buffered" # the CI runners where we run the docker compose tests have slow disks diff --git a/docs/rfcs/030-vectored-timeline-get.md b/docs/rfcs/030-vectored-timeline-get.md index 093a964f38..e933eac5fe 100644 --- a/docs/rfcs/030-vectored-timeline-get.md +++ b/docs/rfcs/030-vectored-timeline-get.md @@ -7,6 +7,8 @@ Author: Christian Schwarz A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver. +**EDIT**: the implementation of this feature is described in [Vlad's (internal) tech talk](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link). + # Motivation During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space. 
diff --git a/docs/rfcs/043-bottom-most-gc-compaction.md b/docs/rfcs/043-bottom-most-gc-compaction.md new file mode 100644 index 0000000000..4bba758b31 --- /dev/null +++ b/docs/rfcs/043-bottom-most-gc-compaction.md @@ -0,0 +1,194 @@ +# Bottommost Garbage-Collection Compaction + +## Summary + +The goal of this doc is to propose a way to reliably collect garbage below the GC horizon. This process is called bottom-most garbage-collect-compaction, and is part of the broader legacy-enhanced compaction that we plan to implement in the future. + +## Motivation + +The current GC algorithm waits until a key region is covered by image layers before collecting the garbage of that key region. Relying on image layer generation to generate covering images is not reliable. There is prior work that feeds information from the GC algorithm back to the image generation process to accelerate garbage collection, but it slows down the system and creates write amplification. + +# Basic Idea + +![](images/036-bottom-most-gc-compaction/01-basic-idea.svg) + +The idea of bottom-most compaction is simple: we rewrite all layers that are below or intersect with the GC horizon to produce a flat level of image layers at the GC horizon and deltas above the GC horizon. In this process, + +- All images and deltas ≤ GC horizon LSN will be dropped. This process collects garbage. +- We produce images for all keys involved in the compaction process at the GC horizon. + +Therefore, it can precisely collect all garbage below the horizon, and reduce the space amplification, e.g., in the staircase pattern (test_gc_feedback). + +![The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line.](images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png) + +The staircase pattern in test_gc_feedback in the original compaction algorithm. The goal is to collect garbage below the red horizontal line.
+ +# Branches + +With branches, the bottom-most compaction should retain a snapshot of the keyspace at the `retain_lsn` so that the child branch can access data at the branch point. This requires some modifications to the basic bottom-most compaction algorithm that we sketched above. + +![](images/036-bottom-most-gc-compaction/03-retain-lsn.svg) + +## Single Timeline w/ Snapshots: handle `retain_lsn` + +First let’s look into the case where we create branches over the main branch but don’t write any data to them (aka “snapshots”). + +The bottom-most compaction algorithm collects all deltas and images of a key and can make decisions on what data to retain. Given that we have a single key’s history as below: + +``` +LSN 0x10 -> A +LSN 0x20 -> append B +retain_lsn: 0x20 +LSN 0x30 -> append C +LSN 0x40 -> append D +retain_lsn: 0x40 +LSN 0x50 -> append E +GC horizon: 0x50 +LSN 0x60 -> append F +``` + +The algorithm will produce: + +``` +LSN 0x20 -> AB +(drop all history below the earliest retain_lsn) +LSN 0x40 -> ABCD +(assume the cost of replaying 2 deltas is higher than storing the full image, we generate an image here) +LSN 0x50 -> append E +(replay one delta is cheap) +LSN 0x60 -> append F +(keep everything as-is above the GC horizon) +``` + +![](images/036-bottom-most-gc-compaction/05-btmgc-parent.svg) + +What happens is that we balance the space taken by each retain_lsn and the cost of replaying deltas during the bottom-most compaction process. This is controlled by a threshold. If `count(deltas) < $threshold`, the deltas will be retained. Otherwise, an image will be generated and the deltas will be dropped. + +In the example above, the `$threshold` is 2. + +## Child Branches with data: pull + partial images + +In the previous section we have shown how bottom-most compaction respects `retain_lsn` so that all data that was readable at branch creation remains readable. But branches can have data on their own, and that data can fall out of the branch’s PITR window. 
So, this section explains how we deal with that. + +We will run the same bottom-most compaction for these branches, to ensure the space amplification on the child branch is reasonable. + +``` +branch_lsn: 0x20 +LSN 0x30 -> append P +LSN 0x40 -> append Q +LSN 0x50 -> append R +GC horizon: 0x50 +LSN 0x60 -> append S +``` + +Note that bottom-most compaction happens on a per-timeline basis. When it processes this key, it only reads the history from LSN 0x30 without a base image. Therefore, on child branches, the bottom-most compaction process will make image creation decisions based on the same `count(deltas) < $threshold` criterion, and if it decides to create an image, the base image will be retrieved from the ancestor branch. + +``` +branch_lsn: 0x20 +LSN 0x50 -> ABPQR +(we pull the image at LSN 0x20 from the ancestor branch to get AB, and then apply appends P, Q, and R to the page; we replace the record at 0x50 with an image and drop the deltas) +GC horizon: 0x50 +LSN 0x60 -> append S +``` + +![](images/036-bottom-most-gc-compaction/06-btmgc-child.svg) + +Note that for child branches, we do not create image layers for the images when bottom-most compaction runs. Instead, we drop the 0x30/0x40/0x50 delta records and directly place the image ABPQR@0x50 into the delta layer, which serves as a sparse image layer. For child branches, if we create image layers, we will need to put all keys in the range into the image layer. This causes space bloat and slow compactions. In this proposal, the compaction process will only compact and process keys modified inside the child branch. + +# Result + +Bottom-most compaction ensures all garbage under the GC horizon gets collected right away (compared with “eventually” in the current algorithm). Meanwhile, it generates images at each of the retain_lsns to ensure branch reads are fast.
As we make per-key decisions on whether to generate an image or not, the theoretical lower bound of the storage space we need to retain for a branch is lower than before. + +Before: min(sum(logs for each key), sum(image for each key)), for each partition — we always generate image layers on a key range + +After: sum(min(logs for each key, image for each key)) + +# Compaction Trigger + +The bottom-most compaction can be automatically triggered. The goal of the trigger is that it should ensure a constant factor for write amplification. Say the user writes 1GB of WAL into the system; we should then write 1GB x C of data to S3. The legacy compaction algorithm does not have such a constant factor C. The data we write to S3 is quadratic in the logical size of the database (see [A Theoretical View of Neon Storage](https://www.notion.so/A-Theoretical-View-of-Neon-Storage-8d7ad7555b0c41b2a3597fa780911194?pvs=21)). + +We propose the following compaction trigger that generates a constant write amplification factor. Write amplification >= total writes to S3 / total user writes. We only analyze the write amplification caused by the bottom-most GC-compaction process, ignoring the legacy create image layers amplification. + +Suppose we have ***X*** bytes of the delta layers above the GC horizon, ***A*** bytes of the delta layers intersecting with the GC horizon, ***B*** bytes of the delta layers below the GC horizon, and ***C*** bytes of the image layers below the GC horizon. + +The legacy GC + compaction loop will always keep ***A*** unchanged, and reduce ***B*** and ***C*** when there are image layers covering the key range. This yields 0 write amplification (only file deletions) and extra ***B*** bytes of space. + +![](images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg) + +The bottom-most compaction proposed here will split ***A*** into deltas above the GC horizon and below the GC horizon.
Everything below the GC horizon will be image layers after the compaction (not considering branches). Therefore, this yields ***A+C*** extra write traffic each iteration, plus 0 extra space. + +![](images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg) + +Also consider read amplification (below the GC horizon). When a read request reaches the GC horizon, the read amplification will be (A+B+C)/C=1+(A+B)/C. Reducing ***A*** and ***B*** can help reduce the read amplification below the GC horizon. + +The metrics-based trigger will wait until a point that space amplification is not that large and write amplification is not that large before the compaction gets triggered. The trigger is defined as **(A+B)/C ≥ 1 (or some other ratio)**. + +To reason about this trigger, consider the two cases: + +**Data Ingestion** + +The user keeps ingesting data into the database, which indicates that WAL size roughly equals the database logical size. The compaction gets triggered only when the newly-written WAL roughly equals the current bottom-most image size (=X). Therefore, it’s triggered when the database size gets doubled. This is a reasonable amount of work. Write amplification is 2X/X=1 for the X amount of data written. + +![](images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg) + +**Updates/Deletion** + +In this case, WAL size will be larger than the database logical size ***D***. The compaction gets triggered for every ***D*** bytes of WAL written. Therefore, for every ***D*** bytes of WAL, we rewrite the bottom-most layer, which produces an extra ***D*** bytes of write amplification. This incurs exactly 2x write amplification (by the write of D), 1.5x write amplification (if we count from the start of the process) and no space amplification. + +![](images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg) + +Note that here I try to reason that write amplification is a constant (i.e., the data we write to S3 is proportional to the data the user writes).
The main problem with the current legacy compaction algorithm is that write amplification is proportional to the database size. + +The next step is to optimize the write amplification above the GC horizon (i.e., change the image creation criteria, top-most compaction, or introduce tiered compaction), to ensure the write amplification of the whole system is a constant factor. + +20GB layers → +20GB layers → delete 20GB, need 40GB temporary space + +# Sub-Compactions + +The gc-compaction algorithm may take a long time and we need to split the job into multiple sub-compaction jobs. + +![](images/036-bottom-most-gc-compaction/13-job-split.svg) + +As in the figure, the auto-trigger schedules a compaction job covering the full keyspace below a specific LSN. In case we cannot finish compacting it in one run in a reasonable amount of time, the algorithm will vertically split it into multiple jobs (in this case, 5). + +Each gc-compaction job will create one level of delta layers and one flat level of image layers for each LSN. Those layers will be automatically split based on size, which means that if the sub-compaction job produces 1GB of deltas, it will produce 4 * 256MB delta layers. Layers that are not fully contained within the sub-compaction job rectangles will be rewritten to only contain the keys outside of the key range. + +# Implementation + +The main implementation of gc-compaction is in `compaction.rs`. + +* `compact_with_gc`: The main loop of gc-compaction. It takes a rectangle range of the layer map and compacts that specific range. It selects layers intersecting with the rectangle, downloads the layers, creates the k-merge iterator to read those layers in the key-lsn order, and decides which keys to keep or insert a reconstructed page. The process is the basic unit of a gc-compaction and is not interruptible. If the process gets preempted by L0 compaction, it has to be restarted from scratch.
For layers that overlap with the rectangle but are not fully inside it, the main loop will also rewrite them so that the new layer (or two layers if both left and right ends are outside of the rectangle) has the same LSN range as the original one but only contains the keys outside of the compaction range. +* `gc_compaction_split_jobs`: Splits a big gc-compaction job into sub-compactions based on heuristics in the layer map. The function looks at the layer map and splits the compaction job based on the size of the layers so that each compaction job only pulls ~4GB of layer files. +* `generate_key_retention` and `KeyHistoryRetention`: Implements the algorithm described in the "Basic Idea" and "Branches" chapters of this RFC. It takes a vector of history of a key (key-lsn-value) and decides which LSNs of the key to retain. If there are too many deltas between two retain_lsns, it will reconstruct the page and insert an image into the compaction result. Also, we implement `KeyHistoryRetention::verify` to ensure the generated result is not corrupted -- all retain_lsns and all LSNs above the gc-horizon should be accessible. +* `GcCompactionQueue`: the automatic trigger implementation for gc-compaction. `GcCompactionQueue::iteration` is called at the end of the tenant compaction loop. It will then call `trigger_auto_compaction` to decide whether to trigger a gc-compaction job for this tenant. If yes, the compaction-job will be added to the compaction queue, and the queue will be slowly drained once there are no other compaction jobs running. gc-compaction has the lowest priority. If a sub-compaction job is not successful or gets preempted by L0 compaction (see limitations for reasons why a compaction job would fail), it will _not_ be retried. +* Changes to `index_part.json`: we added a `last_completed_lsn` field to the index part for the auto-trigger to decide when to trigger a compaction. +* Changes to the read path: when gc-compaction updates the layer map, all reads need to wait.
See `gc_compaction_layer_update_lock` and comments in the code path for more information. + +Gc-compaction can also be scheduled over the HTTP API. Example: + +``` +curl 'localhost:9898/v1/tenant/:tenant_id/timeline/:timeline_id/compact?enhanced_gc_bottom_most_compaction=true&dry_run=true' -X PUT -H "Content-Type: application/json" -d '{"scheduled": true, "compact_key_range": { "start": "000000067F0000A0000002A1CF0100000000", "end": "000000067F0000A0000002A1D70100000000" } }' +``` + +The `dry_run` mode can be specified in the query string so that the compaction will go through all layers to estimate how much space can be saved without writing the compaction result into the layer map. + +The auto-trigger is controlled by the tenant-level flag `gc_compaction_enabled`. If this is set to false, no gc-compaction will be automatically scheduled on this tenant (but manual trigger still works). + +# Next Steps + +There are still some limitations of gc-compaction itself that need to be resolved and tested: + +- gc-compaction is currently only automatically triggered on root branches. We have not tested gc-compaction on child branches in staging. +- gc-compaction will skip aux key regions because of the possible conflict with the assumption of aux file tombstones. +- gc-compaction does not consider keyspaces at retain_lsns and only looks at keys in the layers. This also causes us to give up some sub-compaction jobs because a key might have only part of its history available due to traditional GC removing part of the history. +- We limit gc-compaction to run over shards <= 150GB to avoid gc-compaction taking too much time blocking other compaction jobs. The sub-compaction split algorithm needs to be improved to be able to split vertically and horizontally. Also, we need to move the download layer process out of the compaction loop so that we don't block other compaction jobs for too long. +- The compaction trigger always schedules gc-compaction from the lowest LSN to the gc-horizon.
Currently we do not schedule compaction jobs that only select layers in the middle. Allowing this could potentially reduce the number of layers read and written throughout the process. +- gc-compaction will give up if there are too many layers to rewrite or if there is not enough disk space for the compaction. +- gc-compaction sometimes fails with "no key produced during compaction", which means that all existing keys within the compaction range can be collected; but we don't have a way to write this information back to the layer map -- we cannot generate an empty image layer. +- We limit the maximum size of deltas for a single key to 512MB. If above this size, gc-compaction will give up. This can be resolved by changing `generate_key_retention` to be a stream instead of requiring it to collect all the key history. + +In the future, + +- Top-most compaction: ensure we always have image coverage for the latest data (or near the latest data), so that reads will be fast at the latest LSN. +- Tiered compaction on deltas: ensure reads from any LSN are fast. +- Per-timeline compaction → tenant-wide compaction? diff --git a/docs/rfcs/2025-04-30-direct-io-for-pageserver.md b/docs/rfcs/2025-04-30-direct-io-for-pageserver.md new file mode 100644 index 0000000000..847f5e4040 --- /dev/null +++ b/docs/rfcs/2025-04-30-direct-io-for-pageserver.md @@ -0,0 +1,362 @@ +# Direct IO For Pageserver + +Date: Apr 30, 2025 + +## Summary + +This document is a retroactive RFC. It +- provides some background on what direct IO is, +- motivates why Pageserver should be using it for its IO, and +- describes how we changed Pageserver to use it. + +The [initial proposal](https://github.com/neondatabase/neon/pull/8240) that kicked off the work can be found in this closed GitHub PR. + +People primarily involved in this project were: +- Yuchen Liang +- Vlad Lazar +- Christian Schwarz + +## Timeline + +For posterity, here is the rough timeline of the development work that got us to where we are today.
+ +- Jan 2024: [integrate `tokio-epoll-uring`](https://github.com/neondatabase/neon/pull/5824) along with owned buffers API +- March 2024: `tokio-epoll-uring` enabled in all regions in buffered IO mode +- Feb 2024 to June 2024: PS PageCache Bypass For Data Blocks + - Feb 2024: [Vectored Get Implementation](https://github.com/neondatabase/neon/pull/6576) bypasses delta & image layer blocks for page requests + - Apr to June 2024: [Epic: bypass PageCache for user data blocks](https://github.com/neondatabase/neon/issues/7386) addresses remaining users +- Aug to Nov 2024: direct IO: first code; preliminaries; read path coding; BufferedWriter; benchmarks show perf regressions too high, no-go. +- Nov 2024 to Jan 2025: address perf regressions by developing page_service pipelining (aka batching) and concurrent IO ([Epic](https://github.com/neondatabase/neon/issues/9376)) +- Feb to March 2025: roll out batching, then concurrent+direct IO => read path and InMemoryLayer are now direct IO +- Apr 2025: develop & roll out direct IO for the write path + +## Background: Terminology & Glossary + +**kernel page cache**: the Linux kernel's page cache is a write-back cache for filesystem contents. +The cached unit is memory-page-sized & aligned chunks of the files that are being cached (typically 4k). +The cache lives in kernel memory and is not directly accessible from userspace. + +**Buffered IO**: an application's read/write system calls go through the kernel page cache. +For example, a 10 byte sized read or write to offset 5000 in a file will load the file contents +at offset `[4096,8192)` into a free page in the kernel page cache. If necessary, it will evict +a page to make room (cf eviction). Then, the kernel performs a memory-to-memory copy of 10 bytes +from/to the offset `904` (`5000 = 4096 + 904`) within the cached page. If it's a write, the kernel keeps +track of the fact that the page is now "dirty" in some ancillary structure.
+ +**Writeback**: a buffered read/write syscall returns after the memory-to-memory copy. The modifications +made by e.g. write system calls are not even *issued* to disk, let alone durable. Instead, the kernel +asynchronously writes back dirtied pages based on a variety of conditions. For us, the most relevant +ones are a) explicit request by userspace (`fsync`) and b) memory pressure. + +**Memory pressure**: the kernel page cache is a best effort service and a user of spare memory capacity. +If there is no free memory, the kernel page allocator will take pages used by page cache to satisfy allocations. +Before reusing a page like that, the page has to be written back if it is dirty (writeback, see above). +The far-reaching consequence of this is that **any allocation of anonymous memory can do IO** if the only +way to get that memory is by eviction & re-using a dirty page cache page. +Notably, this includes a simple `malloc` in userspace, because eventually that boils down to `mmap(..., MAP_ANON, ...)`. +I refer to this effect as the "malloc latency backscatter" caused by buffered IO. + +**Direct IO** allows an application's read/write system calls to bypass the kernel page cache. The filesystem +is still involved because it is ultimately in charge of mapping the concept of files & offsets within them +to sectors on block devices. Typically, the filesystem poses size and alignment requirements for memory buffers +and file offsets (statx `stx_dio_mem_align` / `stx_dio_offset_align`), see [this gist](https://gist.github.com/problame/1c35cac41b7cd617779f8aae50f97155). +The IO operations will fail at runtime with EINVAL if the alignment requirements are not met. + +**"buffered" vs "direct"**: the central distinction between buffered and direct IO is about who allocates and +fills the IO buffers, and who controls when exactly the IOs are issued. In buffered IO, it's the syscall handlers, +kernel page cache, and memory management subsystems (cf "writeback").
In direct IO, all of it is done by +the application. +It takes more effort by the application to program with direct instead of buffered IO. +The return is precise control over and a clear distinction between consumption/modification of memory vs disk. + +**Pageserver PageCache**: Pageserver has an additional `PageCache` (referred to as PS PageCache from here on, as opposed to "kernel page cache"). +Its caching unit is 8KiB blocks of the layer files written by Pageserver. +A miss in PageCache is filled by reading from the filesystem, through the `VirtualFile` abstraction layer. +The default size is tiny (64MiB), very much like Postgres's `shared_buffers`. +We ran production at 128MiB for a long time but gradually moved it up to 2GiB over the past ~year. + +**VirtualFile** is Pageserver's abstraction for file IO, very similar to the facility in Postgres that bears the same name. +Its historical purpose appears to be working around open file descriptor limitations, which are practically irrelevant on Linux. +However, the facility in Pageserver is useful as an intermediary layer for metrics and abstracts over the different kinds of +IO engines that Pageserver supports (`std-fs` vs `tokio-epoll-uring`). + +## Background: History Of Caching In Pageserver + +For multiple years, Pageserver's `PageCache` was on the path of all read _and write_ IO. +It performed write-back to the kernel using buffered IO. + +We converted it into a read-only cache of immutable data in [PR 4994](https://github.com/neondatabase/neon/pull/4994). + +The introduction of `tokio-epoll-uring` required converting the code base to use owned IO buffers. +The `PageCache` pages are usable as owned IO buffers. + +We then started bypassing PageCache for user data blocks. +Data blocks are the 8k blocks of data in layer files that hold the multiple `Value`s, as opposed to the disk btree index blocks that tell us which values exist in a file at what offsets.
+The disk btree embedded in delta & image layers remains `PageCache`'d. +Epics for that work were: +- Vectored `Timeline::get` (cf RFC 30) skipped delta and image layer data block `PageCache`ing outright. +- Epic https://github.com/neondatabase/neon/issues/7386 took care of the remaining users for data blocks: + - Materialized page cache (cached materialized pages; shown to be ~0% hit rate in practice) + - InMemoryLayer + - Compaction + +The outcome of the above: +1. All data blocks are always read through the `VirtualFile` APIs, hitting the kernel buffered read path (=> kernel page cache). +2. Indirect blocks (=disk btree blocks) would be cached in the PS `PageCache`. + +In production we size the PS `PageCache` to be 2GiB. +This drives the hit rate up to ~99.95% and the eviction rate / replacement rates down to less than 200/second on a 1-minute average, on the busiest machines. +High baseline replacement rates are treated as a signal of resource exhaustion (page cache insufficient to host working set of the PS). +The response to this is to migrate tenants away, or increase PS `PageCache` size. +It is currently manual but could be automated, e.g., in Storage Controller. + +In the future, we may eliminate the `PageCache` even for indirect blocks. +For example with an LRU cache whose unit is the entire disk btree content +instead of individual blocks. + +## High-Level Design + +So, before work on this project started, all data block reads and the entire write path of Pageserver were using kernel-buffered IO, i.e., the kernel page cache. +We now want to get the kernel page cache out of the picture by using direct IO for all interaction with the filesystem. +This achieves the following system properties: + +**Predictable VirtualFile latencies** +* With buffered IO, reads are sometimes fast, sometimes slow, depending on kernel page cache hit/miss.
+* With buffered IO, appends when writing out new layer files during ingest or compaction are sometimes fast, sometimes slow because of write-back backpressure. +* With buffered IO, the "malloc backscatter" phenomenon pointed out in the Glossary section is not something we actively observe. + But we do have occasional spikes in Dirty memory amount and Memory PSI graphs, so it may already be affecting to some degree. +* By switching to direct IO, above operations will have the (predictable) device latency -- always. + Reads and appends always go to disk. + And malloc will not have to write back dirty data. + +**Explicitness & Tangibility of resource usage** +* In a multi-tenant system, it is generally desirable and valuable to be *explicit* about the main resources we use for each tenant. +* By using direct IO, we become explicit about the resources *disk IOPs* and *memory capacity* in a way that was previously being conflated through the kernel page cache, outside our immediate control. +* We will be able to build per-tenant observability of resource usage ("what tenant is causing the actual IOs that are sent to the disk?"). +* We will be able to build accounting & QoS by implementing an IO scheduler that is tenant aware. The kernel is not tenant-aware and can't do that. + +**CPU Efficiency** +* The involvement of the kernel page cache means one additional memory-to-memory copy on read and write path. +* Direct IO will eliminate that memory-to-memory copy, if we can make the userspace buffers used for the IO calls satisfy direct IO alignment requirements. + +The **trade-off** is that we no longer get the theoretical benefits of the kernel page cache. 
These are: +- read latency improvements for repeat reads of the same data ("locality of reference") + - asterisk: only if that state is still cache-resident by time of next access +- write throughput by having kernel page cache batch small VFS writes into bigger disk writes + - asterisk: only if memory pressure is low enough that the kernel can afford to delay writeback + +We are **happy to make this trade-off**: +- Because of the advantages listed above. +- Because we empirically have enough DRAM on Pageservers to serve metadata (=index blocks) from PS PageCache. + (At just 2GiB PS PageCache size, we average a 99.95% hit rate). + So, the latency of going to disk is only for data block reads, not the index traversal. +- Because **the kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance). + And because dense packing of tenants will always be desirable to drive COGS down, we should design the system for it. + (See the appendix for a more detailed explanation why this is). +- So, we accept that some reads that used to be fast by circumstance will have higher but **predictable** latency than before. + +### Desired End State + +The desired end state of the project is as follows, and with some asterisks, we have achieved it. + +All IOs of the Pageserver data path use direct IO, thereby bypassing the kernel page cache. + +In particular, the "data path" includes +- the wal ingest path +- compaction +- anything on the `Timeline::get` / `Timeline::get_vectored` path. + +The production Pageserver config is tuned such that virtually all non-data blocks are cached in the PS PageCache. +Hit rate target is 99.95%. + +There are no regressions to ingest latency. + +The total "wait-for-disk time" contribution to random getpage request latency is `O(1 read IOP latency)`. +We accomplish that by having a near 100% PS PageCache hit rate so that layer index traversal effectively never needs to wait for IO. 
+Thereby, it can issue all the data blocks as it traverses the index, and only wait at the end of it (concurrent IO). + +The amortized "wait-for-disk time" contribution of this direct IO proposal to a series of sequential getpage requests is `1/32 * read IOP latency` for each getpage request. +We accomplish this by server-side batching of up to 32 reads into a single `Timeline::get_vectored` call. +(This is an ideal world where our batches are full - that's not the case in prod today because of lack of queue depth). + +## Design & Implementation + +### Prerequisites + +A lot of prerequisite work had to happen to enable use of direct IO. + +To meet the "wait-for-disk time" requirements from the DoD, we implement for the read path: +- page_service level server-side batching (config field `page_service_pipelining`) +- concurrent IO (config field `get_vectored_concurrent_io`) +The work for both of these was tracked [in the epic](https://github.com/neondatabase/neon/issues/9376). +Server-side batching will likely be obsoleted by the [#proj-compute-communicator](https://github.com/neondatabase/neon/pull/10799). +The Concurrent IO work is described in retroactive RFC `2025-04-30-pageserver-concurrent-io-on-read-path.md`. +The implementation is relatively brittle and needs further investment, see the `Future Work` section in that RFC. + +For the write path, and especially WAL ingest, we need to hide write latency. +We accomplish this by implementing a (`BufferedWriter`) type that does double-buffering: flushes of the filled +buffer happen in a sidecar tokio task while new writes fill a new buffer. +We refactor InMemoryLayer as well as BlobWriter (=> delta and image layer writers) to use this new `BufferedWriter`. +The most comprehensive write-up of this work is in [the PR description](https://github.com/neondatabase/neon/pull/11558). 
+ +### Ensuring Adherence to Alignment Requirements + +Direct IO puts requirements on +- memory buffer alignment +- io size (=memory buffer size) +- file offset alignment + +The requirements are specific to a combination of filesystem/block-device/architecture(hardware page size!). + +In Neon production environments we currently use ext4 with Linux 6.1.X on AWS and Azure storage-optimized instances (locally attached NVMe). +Instead of dynamic discovery using `statx`, we statically hard-code 512 bytes as the buffer/offset alignment and size-multiple. +We made this decision because: +- a) it is compatible with all the environments we need to run in +- b) our primary workload can be small-random-read-heavy (we do merge adjacent reads if possible, but the worst case is that all `Value`s that need to be read are far apart) +- c) 512-byte tail latency on the production instance types is much better than 4k (p99.9: 3x lower, p99.99 5x lower). +- d) hard-coding at compile-time allows us to use the Rust type system to enforce the use of only aligned IO buffers, eliminating a source of runtime errors typically associated with direct IO. + +This was [discussed here](https://neondb.slack.com/archives/C07BZ38E6SD/p1725036790965549?thread_ts=1725026845.455259&cid=C07BZ38E6SD). + +The new `IoBufAligned` / `IoBufAlignedMut` marker traits indicate that a given buffer meets memory alignment requirements. +All `VirtualFile` APIs and several software layers built on top of them only accept buffers that implement those traits. +Implementors of the marker traits are: +- `IoBuffer` / `IoBufferMut`: used for most reads and writes +- `PageWriteGuardBuf`: for filling PS PageCache pages (index blocks!) + +The alignment requirement is infectious; it permeates bottom-up throughout the code base. +We stop the infection at roughly the same layers in the code base where we stopped permeating the +use of owned-buffers-style API for tokio-epoll-uring. 
The way the stopping works is by introducing +a memory-to-memory copy from/to some unaligned memory location on the stack/current/heap. +The places where we currently stop permeating are sort of arbitrary. For example, it would probably +make sense to replace more usage of `Bytes` that we know holds 8k pages with 8k-sized `IoBuffer`s. + +The `IoBufAligned` / `IoBufAlignedMut` types do not protect us from the following types of runtime errors: +- non-adherence to file offset alignment requirements +- non-adherence to io size requirements + +The following higher-level constructs ensure we meet the requirements: +- read path: the `ChunkedVectoredReadBuilder` and `mod vectored_dio_read` ensure reads happen at aligned offsets and in appropriate size multiples. +- write path: `BufferedWriter` only writes in multiples of the capacity, at offsets that are `start_offset+N*capacity`; see its doc comment. + +Note that these types are used always, regardless of whether direct IO is enabled or not. +There are some cases where this adds unnecessary overhead to buffered IO (e.g. all memcpy's inflated to multiples of 512). +But we could not identify meaningful impact in practice when we shipped these changes while we were still using buffered IO. + +### Configuration / Feature Flagging + +In the previous section we described how all users of VirtualFile were changed to always adhere to direct IO alignment and size-multiple requirements. +To actually enable direct IO, all we need to do is set the `O_DIRECT` flag in `open` syscalls / io_uring operations. + +We set `O_DIRECT` based on: +- the VirtualFile API used to create/open the VirtualFile instance +- the `virtual_file_io_mode` configuration flag +- the OpenOptions `read` and/or `write` flags. + +The VirtualFile APIs suffixed with `_v2` are the only ones that _may_ open with `O_DIRECT` depending on the other two factors in above list. +Other APIs never use `O_DIRECT`. +(The name is bad and should really be `_maybe_direct_io`.) 
+ +The reason for having new APIs is because all code used VirtualFile but implementation and rollout happened in consecutive phases (read path, InMemoryLayer, write path). +At the VirtualFile level, context on whether an instance of VirtualFile is on read path, InMemoryLayer, or write path is not available. + +The `_v2` APIs then make the decision to set `O_DIRECT` based on the `virtual_file_io_mode` flag and the OpenOptions `read`/`write` flags. +The result is the following runtime behavior: + +|what|OpenOptions|`v_f_io_mode`
=`buffered`|`v_f_io_mode`
=`direct`|`v_f_io_mode`
=`direct-rw`| +|-|-|-|-|-| +|`DeltaLayerInner`|read|()|O_DIRECT|O_DIRECT| +|`ImageLayerInner`|read|()|O_DIRECT|O_DIRECT| +|`InMemoryLayer`|read + write|()|()*|O_DIRECT| +|`DeltaLayerWriter`| write | () | () | O_DIRECT | +|`ImageLayerWriter`| write | () | () | O_DIRECT | +|`download_layer_file`|write |()|()|O_DIRECT| + +The `InMemoryLayer` is marked with `*` because there was a period when it *did* use O_DIRECT under `=direct`. +That period was when we implemented and shipped the first version of `BufferedWriter`. +We used it in `InMemoryLayer` and `download_layer_file` but it was only sensitive to `v_f_io_mode` in `InMemoryLayer`. +The introduction of `=direct-rw`, and the switch of the remaining write path to `BufferedWriter`, happened later, +in https://github.com/neondatabase/neon/pull/11558. + +Note that this way of feature flagging inside VirtualFile makes it less and less a general purpose POSIX file access abstraction. +For example, with `=direct-rw` enabled, it is no longer possible to open a `VirtualFile` without `O_DIRECT`. It'll always be set. + +## Correctness Validation + +The correctness risks with this project were: +- Memory safety issues in the `IoBuffer` / `IoBufferMut` implementation. + These types expose an API that is largely identical to that of the `bytes` crate and/or Vec. +- Runtime errors (=> downtime / unavailability) because of non-adherence to alignment/size-multiple requirements, resulting in EINVAL on the read path. + +We sadly do not have infrastructure to run pageserver under `cargo miri`. +So for memory safety issues, we relied on careful peer review. + +We do assert the production-like alignment requirements in testing builds. +However, these asserts were added retroactively. +The actual validation before rollout happened in staging and pre-prod. +We eventually enabled `=direct`/`=direct-rw` for Rust unit tests and the regression test suite. 
+I cannot recall a single instance of staging/pre-prod/production errors caused by non-adherence to alignment/size-multiple requirements. +Evidently developer testing was good enough. + +## Performance Validation + +The read path went through a lot of iterations of benchmarking in staging and pre-prod. +The benchmarks in those environments demonstrated performance regressions early in the implementation. +It was actually this performance testing that made us implement batching and concurrent IO to avoid unacceptable regressions. + +The write path was much quicker to validate because `bench_ingest` covered all of the (less numerous) access patterns. + +## Future Work + +There is minor and major follow-up work that can be considered in the future. +Check the (soon-to-be-closed) Epic https://github.com/neondatabase/neon/issues/8130's "Follow-Ups" section for a current list. + +Read Path: +- PS PageCache hit rate is crucial to unlock concurrent IO and reasonable latency for random reads generally. + Instead of reactively sizing PS PageCache, we should estimate the required PS PageCache size + and potentially also use that to drive placement decisions of shards from StorageController + https://github.com/neondatabase/neon/issues/9288 +- ... unless we get rid of PS PageCache entirely and cache the index block in a more specialized cache. + But even then, an estimation of the working set would be helpful to figure out caching strategy. + +Write Path: +- BlobWriter and its users could switch back to a borrowed API https://github.com/neondatabase/neon/issues/10129 +- ... 
unless we want to implement bypass mode for large writes https://github.com/neondatabase/neon/issues/10101 +- The `TempVirtualFile` introduced as part of this project could internalize more of the common usage pattern: https://github.com/neondatabase/neon/issues/11692 +- Reduce conditional compilation around `virtual_file_io_mode`: https://github.com/neondatabase/neon/issues/11676 + +Both: +- A performance simulation mode that pads VirtualFile op latencies to typical NVMe latencies, even if the underlying storage is faster. + This would avoid misleadingly good performance on developer systems and in benchmarks on systems that are less busy than production hosts. + However, padding latencies at microsecond scale is non-trivial. + +Misc: +- We should finish trimming VirtualFile's scope to be truly limited to core data path read & write. + Abstractions for reading & writing pageserver config, location config, heatmaps, etc, should use + APIs in a different package (`VirtualFile::crashsafe_overwrite` and `VirtualFile::read_to_string` + are good entrypoints for cleanup.) https://github.com/neondatabase/neon/issues/11809 + +# Appendix + +## Why Kernel Page Cache Is Ineffective At High Tenant Density + +In the Motivation section, we stated: + +> - **The kernel page cache is ineffective** at high tenant density anyway (#tenants/pageserver instance). + +The reason is that the Pageserver workload sent from Computes is whatever is a Compute cache(s) miss. +That's either sequential scans or random reads. +A random read workload simply causes cache thrashing because a packed Pageserver NVMe drive (`im4gn.2xlarge`) has ~100x more capacity than DRAM available. +It is a complete waste to have the kernel page cache cache data blocks in this case. +Sequential read workloads *can* benefit iff those pages have been updated recently (=no image layer yet) and together in time/LSN space. +In such cases, the WAL records of those updates likely sit on the same delta layer block. 
+When Compute does a sequential scan, it sends a series of single-page requests for these individual pages. +When Pageserver processes the second request in such a series, it goes to the same delta layer block and has a kernel page cache hit. +This dependence on kernel page cache for sequential scan performance is significant, but the solution is at a higher level than generic data block caching. +We can either add a small per-connection LRU cache for such delta layer blocks. +Or we can merge those sequential requests into a larger vectored get request, which is designed to never read a block twice. +This amortizes the read latency for our delta layer block across the vectored get batch size (which currently is up to 32). + +There are Pageserver-internal workloads that do sequential access (compaction, image layer generation), but these +1. are not latency-critical and can do batched access outside of the `page_service` protocol constraints (image layer generation) +2. don't actually need to reconstruct images and therefore can use totally different access methods (=> compaction can use k-way merge iterators with their own internal buffering / prefetching). diff --git a/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md b/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md new file mode 100644 index 0000000000..2dc937d298 --- /dev/null +++ b/docs/rfcs/2025-04-30-pageserver-concurrent-io-on-read-path.md @@ -0,0 +1,251 @@ +# Concurrent IO for Pageserver Read Path + +Date: May 6, 2025 + +## Summary + +This document is a retroactive RFC on the Pageserver Concurrent IO work that happened in late 2024 / early 2025. + +The gist of it is that Pageserver's `Timeline::get_vectored` now _issues_ the data block read operations against layer files +_as it traverses the layer map_ and only _waits_ once, for all of them, after traversal is complete. 
+ +Assuming PS PageCache hits on the index blocks during traversal, this drives the "wait-for-disk" time +contribution down from `random_read_io_latency * O(number_of_values)` to `random_read_io_latency * O(1 + traversal)`. + +The motivation for why this work had to happen when it happened was the switch of Pageserver to +- not cache user data blocks in PS PageCache and +- switch to use direct IO. +More context on this is given in the complementary RFC `./rfcs/2025-04-30-direct-io-for-pageserver.md`. + +### Refs + +- Epic: https://github.com/neondatabase/neon/issues/9378 +- Prototyping happened during the Lisbon 2024 Offsite hackathon: https://github.com/neondatabase/neon/pull/9002 +- Main implementation PR with good description: https://github.com/neondatabase/neon/issues/9378 + +Design and implementation by: +- Vlad Lazar +- Christian Schwarz + +## Background & Motivation + +The Pageserver read path (`Timeline::get_vectored`) consists of two high-level steps: +- Retrieve the delta and image `Value`s required to reconstruct the requested Page@LSN (`Timeline::get_values_reconstruct_data`). +- Pass these values to walredo to reconstruct the page images. + +The read path used to be single-key but has been made multi-key some time ago. +([Internal tech talk by Vlad](https://drive.google.com/file/d/1vfY24S869UP8lEUUDHRWKF1AJn8fpWoJ/view?usp=drive_link)) +However, for simplicity, most of this doc will explain things in terms of a single key being requested. + +The `Value` retrieval step above can be broken down into the following functions: +- **Traversal** of the layer map to figure out which `Value`s from which layer files are required for the page reconstruction. +- **Read IO Planning**: planning of the read IOs that need to be issued to the layer files / filesystem / disk. + The main job here is to coalesce the small value reads into larger filesystem-level read operations. 
+ This layer also takes care of direct IO alignment and size-multiple requirements (cf the RFC for details.) + Check `struct VectoredReadPlanner` and `mod vectored_dio_read` for how it's done. +- **Perform the read IO** using `tokio-epoll-uring`. + +Before this project, above functions were sequentially interleaved, meaning: +1. we would advance traversal, ... +2. discover, that we need to read a value, ... +3. read it from disk using `tokio-epoll-uring`, ... +4. goto 1 unless we're done. + +This meant that if N `Value`s need to be read to reconstruct a page, +the time we spend waiting for disk will be `random_read_io_latency * O(number_of_values)`. + +## Design + +The **traversal** and **read IO Planning** jobs still happen sequentially, layer by layer, as before. +But instead of performing the read IOs inline, we submit the IOs to a concurrent tokio task for execution. +After the last read from the last layer is submitted, we wait for the IOs to complete. + +Assuming the filesystem / disk is able to actually process the submitted IOs without queuing, +we arrive at _time spent waiting for disk_ ~ `random_read_io_latency * O(1 + traversal)`. + +Note this whole RFC is concerned with the steady state where all layer files required for reconstruction are resident on local NVMe. +Traversal will stall on on-demand layer download if a layer is not yet resident. +It cannot proceed without the layer being resident because its next step depends on the contents of the layer index. + +### Avoiding Waiting For IO During Traversal + +The `traversal` component in above time-spent-waiting-for-disk estimation is dominant and needs to be minimized. + +Before this project, traversal needed to perform IOs for the following: +1. The time we are waiting on PS PageCache to page in the visited layers' disk btree index blocks. +2. 
When visiting a delta layer, reading the data block that contains a `Value` for a requested key, + to determine whether the `Value::will_init` the page and therefore traversal can stop for this key. + +The solution for (1) is to raise the PS PageCache size such that the hit rate is practically 100%. +(Check out the `Background: History Of Caching In Pageserver` section in the RFC on Direct IO for more details.) + +The solution for (2) is to source `will_init` from the disk btree index keys, which fortunately +already encode this bit of information since the introduction of the current storage/layer format. + +### Concurrent IOs, Submission & Completion + +To separate IO submission from waiting for its completion, +we introduce the notion of an `IoConcurrency` struct through which IOs are issued. + +An IO is an opaque future that +- captures the `tx` side of a `oneshot` channel +- performs the read IO by calling `VirtualFile::read_exact_at().await` +- sends the result into the `tx` + +Issuing an IO means `Box`ing the future above and handing that `Box` over to the `IoConcurrency` struct. + +The traversal code that submits the IO stores the corresponding `oneshot::Receiver` +in the `VectoredValueReconstructState`, in the place where we previously stored +the sequentially read `img` and `records` fields. + +When we're done with traversal, we wait for all submitted IOs: +for each key, there is a future that awaits all the `oneshot::Receiver`s +for that key, and then calls into walredo to reconstruct the page image. +Walredo is now invoked concurrently for each value instead of sequentially. +Walredo itself remains unchanged. + +The spawned IO futures are driven to completion by a sidecar tokio task that +is separate from the task that performs all the layer visiting and spawning of IOs. +That task receives the IO futures via an unbounded mpsc channel and +drives them to completion inside a `FuturesUnordered`. 
+ +### Error handling, Panics, Cancellation-Safety + +There are two error classes during reconstruct data retrieval: +* traversal errors: index lookup, move to next layer, and the like +* value read IO errors + +A traversal error fails the entire `get_vectored` request, as before this PR. +A value read error only fails reconstruction of that value. + +Panics and dropping of the `get_vectored` future before it completes +leave the sidecar task running and do not cancel submitted IOs +(see next section for details on sidecar task lifecycle). +All of this is safe, but, today's preference in the team is to close out +all resource usage explicitly if possible, rather than cancelling + forgetting +about it on drop. So, there is a warning if we drop a +`VectoredValueReconstructState`/`ValuesReconstructState` that still has uncompleted IOs. + +### Sidecar Task Lifecycle + +The sidecar tokio task is spawned as part of the `IoConcurrency::spawn_from_conf` struct. +The `IoConcurrency` object acts as a handle through which IO futures are submitted. + +The spawned tokio task holds the `Timeline::gate` open. +It is _not_ sensitive to `Timeline::cancel`, but instead to the `IoConcurrency` object being dropped. + +Once the `IoConcurrency` struct is dropped, no new IO futures can come in +but already submitted IO futures will be driven to completion regardless. +We _could_ safely stop polling these futures because `tokio-epoll-uring` op futures are cancel-safe. +But the underlying kernel and hardware resources are not magically freed up by that. +So, again, in the interest of closing out all outstanding resource usage, we make timeline shutdown wait for sidecar tasks and their IOs to complete. +Under normal conditions, this should be in the low hundreds of microseconds. + +It is advisable to make the `IoConcurrency` as long-lived as possible to minimize the amount of +tokio task churn (=> lower pressure on tokio). Generally this means creating it "high up" in the call stack. 
+The pain with this is that the `IoConcurrency` reference needs to be propagated "down" to +the (short-lived) functions/scope where we issue the IOs. +We would like to use `RequestContext` for this propagation in the future (issue [here](https://github.com/neondatabase/neon/issues/10460)). +For now, we just add another argument to the relevant code paths. + +### Feature Gating + +The `IoConcurrency` is an `enum` with two variants: `Sequential` and `SidecarTask`. + +The behavior from before this project is available through `IoConcurrency::Sequential`, +which awaits the IO futures in place, without "spawning" or "submitting" them anywhere. + +The `get_vectored_concurrent_io` pageserver config variable determines the runtime value, +**except** for the places that use `IoConcurrency::sequential` to get an `IoConcurrency` object. + +### Alternatives Explored & Caveats Encountered + +A few words on the rationale behind having a sidecar *task* and what +alternatives were considered but abandoned. + +#### Why We Need A Sidecar *Task* / Why Just `FuturesUnordered` Doesn't Work + +We explored to not have a sidecar task, and instead have a `FuturesUnordered` per +`Timeline::get_vectored`. We would queue all IO futures in it and poll it for the +first time after traversal is complete (i.e., at `collect_pending_ios`). + +The obvious disadvantage, but not showstopper, is that we wouldn't be submitting +IOs until traversal is complete. + +The showstopper however, is that deadlocks happen if we don't drive the +IO futures to completion independently of the traversal task. +The reason is that both the IO futures and the traversal task may hold _some_, +_and_ try to acquire _more_, shared limited resources. 
+For example, both the traversal task and IO future may try to acquire +* a `VirtualFile` file descriptor cache slot async mutex (observed during impl) +* a `tokio-epoll-uring` submission slot (observed during impl) +* a `PageCache` slot (currently this is not the case but we may move more code into the IO futures in the future) + +#### Why We Don't Do `tokio::task`-per-IO-future + +Another option is to spawn a short-lived `tokio::task` for each IO future. +We implemented and benchmarked it during development, but found little +throughput improvement and moderate mean & tail latency degradation. +Concerns about pressure on the tokio scheduler led us to abandon this variant. + +## Future Work + +In addition to what is listed here, also check the "Punted" list in the epic: +https://github.com/neondatabase/neon/issues/9378 + +### Enable `Timeline::get` + +The only major code path that still uses `IoConcurrency::sequential` is `Timeline::get`. +The impact is that roughly the following parts of pageserver do not benefit yet: +- parts of basebackup +- reads performed by the ingest path +- most internal operations that read metadata keys (e.g. `collect_keyspace`!) + +The solution is to propagate `IoConcurrency` via `RequestContext`: https://github.com/neondatabase/neon/issues/10460 + +The tricky part is to figure out at which level of the code the `IoConcurrency` is spawned (and added to the RequestContext). + +Also, propagation via `RequestContext` makes it harder to tell during development whether a given +piece of code uses concurrent vs sequential mode: one has to recursively walk up the call tree to find the +place that puts the `IoConcurrency` into the `RequestContext`. +We'd have to use `::Sequential` as the conservative default value in a fresh `RequestContext`, and add some +observability to weed out places that fail to enrich with a properly spawned `IoConcurrency::spawn_from_conf`. 
+ +### Concurrent On-Demand Downloads enabled by Detached Indices + +As stated earlier, traversal stalls on on-demand download because its next step depends on the contents of the layer index. +Once we have separated indices from data blocks (=> https://github.com/neondatabase/neon/issues/11695) +we will only need to stall if the index is not resident. The download of the data blocks can happen concurrently or in the background. For example: +- Move the `Layer::get_or_maybe_download().await` inside the IO futures. + This goes in the opposite direction of the next "future work" item below, but it's easy to do. +- Serve the IO future directly from object storage and dispatch the layer download + to some other actor, e.g., an actor that is responsible for both downloads & eviction. + +### New `tokio-epoll-uring` API That Separates Submission & Wait-For-Completion + +Instead of `$op().await` style API, it would be useful to have a different `tokio-epoll-uring` API +that separates enqueuing (without necessarily `io_uring_enter`ing the kernel each time), submission, +and then wait for completion. + +The `$op().await` API is too opaque, so we _have_ to stuff it into a `FuturesUnordered`. + +A split API as sketched above would allow traversal to ensure an IO operation is enqueued to the kernel/disk (and get back-pressure iff the io_uring squeue is full). +While avoiding spending of CPU cycles on processing of completions while we're still traversing. + +The idea gets muddied by the fact that we may self-deadlock if we submit too much without completing. +So, the submission part of the split API needs to process completions if squeue is full. + +In any case, this split API is a precondition for addressing the bigger issue with the design presented here, +which we discuss in the next section. + +### Opaque Futures Are Brittle + +The use of opaque futures to represent submitted IOs is a clever hack to minimize changes & allow for near-perfect feature-gating. 
+However, we take on **brittleness** because callers must guarantee that the submitted futures are independent. +By our experience, it is non-trivial to identify or rule out the interdependencies. +See the lengthy doc comment on the `IoConcurrency::spawn_io` method for more details. + +The better interface and proper subsystem boundary is a _descriptive_ struct of what needs to be done ("read this range from this VirtualFile into this buffer") +and get back a means to wait for completion. +The subsystem can thereby reason by its own how operations may be related; +unlike today, where the submitted opaque future can do just about anything. diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg new file mode 100644 index 0000000000..7107198c0a --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/01-basic-idea.svg @@ -0,0 +1,135 @@ + + + + + + 01-basic-idea + + + Layer 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GC Horizon + + + + + Images + at earlier LSN + + + + + Deltas + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Deltas + + + + + Images + at GC LSN + + + + + + Deltas above GC Horizon + + + + + + Deltas below GC Horizon + + + + + + Deltas above GC Horizon + + + + + + Deltas and image below GC Horizon gets garbage-collected + + + + + WAL replay of deltas+image below GC Horizon + Reshuffle deltas + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg new file mode 100644 index 0000000000..792db6d69e --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/03-retain-lsn.svg @@ -0,0 +1,141 @@ + + + + + + + + + + + + 03-retain-lsn + + + Layer 1 + + + + + + + + + + + + + + + + + + GC Horizon + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + retain_lsn 1 + + + + + 
+ + + retain_lsn 2 + + + + + + + + retain_lsn 3 + + + + + + + + retain_lsn 4 + + + + + + + + + Dependent Branch + + + + + retain_lsn 3 + + + + + + + + Branch GC Horizon + + + + + + + + + Partial Image Coverage + + + + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg new file mode 100644 index 0000000000..9593ed969e --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/05-btmgc-parent.svg @@ -0,0 +1,187 @@ + + + + + + 05-btmgc-parent + + + Layer 1 + + + + + Append C@0x30 + + + + + + + + + + + + + + + + GC Horizon + + + + + + + Append F@0x60 + + + + + + + + + + + + + + + Append E@0x50 + Append D@0x40 + + + + + + + + + + + + + + + A@0x10, Append B@0x20 + + + + + + + + + + + + + + + + 0x20 + + + + + 0x50 + + + + + + + + 0x40 + + + + + + + + GC Horizon + + + + + + + Append F@0x60 + + + + + + + Append E@0x50 + + + + + + + + 0x20 + + + + + 0x50 + + + + + + + + 0x40 + + + + + + + AB@0x20 + + + + + + + + + + + + + + + ABCD@0x40 + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg new file mode 100644 index 0000000000..b8a93d5b5f --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/06-btmgc-child.svg @@ -0,0 +1,184 @@ + + + + + + 06-btmgc-child + + + Layer 1 + + + + + + + + + Append P@0x30 + + + + + + + + + + + + + + + + GC Horizon + + + + + + + Append S@0x60 + + + + + + + + + + + + + + + Append R@0x50 + Append Q@0x40 + + + + + + + + + + + + + 0x50 + + + + + + + + 0x20 + + + + + + + AB@0x20 + + + + + + + + + + + + + Ancestor Branch + + + + + + + + + + + + GC Horizon + + + + + + + Append S@0x60 + + + + + 0x50 + + + + + + + + 0x20 + + + + + + + AB + @0x20 + + + + + + + + + + + + + Ancestor Branch + + + + + + + + + + + AB + PQR@0x50 + + + + + + + + + + + + + + + + + + + + + diff --git 
a/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg new file mode 100644 index 0000000000..65034226da --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/07-btmgc-analysis-1.svg @@ -0,0 +1,180 @@ + + + + + + 07-btmgc-analysis-1 + + + Layer 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + GC Horizon + + + + + Images + at earlier LSN + + + + + Deltas + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Deltas + + + + + Images + at GC LSN + + + + + + Deltas above GC Horizon + + + + + + Deltas below GC Horizon + + + + + + Deltas above GC Horizon + + + + + + Deltas and image below GC Horizon gets garbage-collected + + + + + size=A + + + + + + + + + + + + + + + + + + + + + + + + + + + + + size=B + + + + + size=C + + + + + A + + + + + + B + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg new file mode 100644 index 0000000000..16a17ec56e --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/08-optimization.svg @@ -0,0 +1,158 @@ + + + + + + 08-optimization + + + Layer 1 + + + + + + + + + + + + + + GC Horizon + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0x20 + + + + + + + + 0x50 + + + + + 0x60 + + + + + + + + 0x40 + + + + + + + + + + + + + + + + GC Horizon + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0x20 + + + + + 0x70 + + + + + + + + 0x40 + + + + + + + + + + + + + + + + 0x50 + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg new file mode 100644 index 0000000000..243f038c88 --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/09-btmgc-analysis-2.svg @@ -0,0 +1,184 @@ + + + + + + 09-btmgc-analysis-2 + + + Layer 1 + + + + + C + + + + 
+ + + C + + + + + + + C + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + + GC Horizon + + + + + + + + + + + B + + + + + + + B + + + + + + + B + + + + + + + C + + + + + + + C + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + A + + + + + + + + GC Horizon + + + + + + + B + + + + + + + C + + + + + B + + + + + + C + + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg new file mode 100644 index 0000000000..1e49ec017b --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/10-btmgc-analysis-3.svg @@ -0,0 +1,81 @@ + + + + + + 10-btmgc-analysis-3 + + + Layer 1 + + + + + 1/5 X + + + + + + + 1/5 X + + + + + + + 1/5 X + + + + + + + 1/5 X + + + + + + + + GC Horizon + + + + + + + X + + + + + + + + + + + + GC Horizon + + + + + + + 2X + + + + + + + 1/5 X + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg b/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg new file mode 100644 index 0000000000..510d7a0c3e --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/11-btmgc-analysis-4.svg @@ -0,0 +1,81 @@ + + + + + + 11-btmgc-analysis-4 + + + Layer 1 + + + + + 1/5 D + + + + + + + 1/5 D + + + + + + + 1/5 D + + + + + + + 1/5 D + + + + + + + 1/5 D + + + + + + + + GC Horizon + + + + + + + D + + + + + + + + + + + + GC Horizon + + + + + + + D + + + + + diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png b/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png new file mode 100644 index 0000000000..c106f3ee89 Binary files /dev/null and b/docs/rfcs/images/036-bottom-most-gc-compaction/12-staircase-test-gc-feedback.png differ diff --git a/docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg 
b/docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg new file mode 100644 index 0000000000..37c38c727c --- /dev/null +++ b/docs/rfcs/images/036-bottom-most-gc-compaction/13-job-split.svg @@ -0,0 +1,176 @@ + + + + + + gc-compaction-split + + + Layer 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + branch point + + + + + + + + last branch point + + + + + + Job 1 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + branch point + + + + + + + + last branch point + + + + + + Job 2 + + + + + + Job 3 + + + + + + Job 4 + + + + + + Job 5 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Delta Layer + + + + + + + Image Layer + + + + + diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs index 0bd7fe5f28..f44efe6d7a 100644 --- a/endpoint_storage/src/app.rs +++ b/endpoint_storage/src/app.rs @@ -462,6 +462,8 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH if var(REAL_S3_ENV).is_ok() { assert!(body.contains("remote_storage_s3_deleted_objects_total")); } + + #[cfg(target_os = "linux")] assert!(body.contains("process_threads")); } diff --git a/libs/neon-shmem/Cargo.toml b/libs/neon-shmem/Cargo.toml new file mode 100644 index 0000000000..2a636bec40 --- /dev/null +++ b/libs/neon-shmem/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "neon-shmem" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +thiserror.workspace = true +nix.workspace=true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } + +[target.'cfg(target_os = "macos")'.dependencies] +tempfile = "3.14.0" diff --git a/libs/neon-shmem/src/lib.rs b/libs/neon-shmem/src/lib.rs new file mode 100644 index 0000000000..e1b14b1371 --- /dev/null +++ b/libs/neon-shmem/src/lib.rs @@ -0,0 +1,418 @@ +//! 
Shared memory utilities for neon communicator + +use std::num::NonZeroUsize; +use std::os::fd::{AsFd, BorrowedFd, OwnedFd}; +use std::ptr::NonNull; +use std::sync::atomic::{AtomicUsize, Ordering}; + +use nix::errno::Errno; +use nix::sys::mman::MapFlags; +use nix::sys::mman::ProtFlags; +use nix::sys::mman::mmap as nix_mmap; +use nix::sys::mman::munmap as nix_munmap; +use nix::unistd::ftruncate as nix_ftruncate; + +/// ShmemHandle represents a shared memory area that can be shared by processes over fork(). +/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's +/// specified at creation. +/// +/// The area is backed by an anonymous file created with memfd_create(). The full address space for +/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`], +/// the underlying file is resized. Do not access the area beyond the current size. Currently, that +/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the +/// future. +pub struct ShmemHandle { + /// memfd file descriptor + fd: OwnedFd, + + max_size: usize, + + // Pointer to the beginning of the shared memory area. The header is stored there. + shared_ptr: NonNull<SharedStruct>, + + // Pointer to the beginning of the user data + pub data_ptr: NonNull<u8>, +} + +/// This is stored at the beginning of the shared memory area. +struct SharedStruct { + max_size: usize, + + /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag + current_size: AtomicUsize, +} + +const RESIZE_IN_PROGRESS: usize = 1 << 63; + +const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>(); + +/// Error type returned by the ShmemHandle functions. 
+#[derive(thiserror::Error, Debug)] +#[error("{msg}: {errno}")] +pub struct Error { + pub msg: String, + pub errno: Errno, +} + +impl Error { + fn new(msg: &str, errno: Errno) -> Error { + Error { + msg: msg.to_string(), + errno, + } + } +} + +impl ShmemHandle { + /// Create a new shared memory area. To communicate between processes, the processes need to be + /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes. + /// + /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other + /// processes can continue using it, however. + pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result { + // create the backing anonymous file. + let fd = create_backing_file(name)?; + + Self::new_with_fd(fd, initial_size, max_size) + } + + fn new_with_fd( + fd: OwnedFd, + initial_size: usize, + max_size: usize, + ) -> Result { + // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size + // is a little larger than this because of the SharedStruct header. Make the upper limit + // somewhat smaller than that, because with anything close to that, you'll run out of + // memory anyway. + if max_size >= 1 << 48 { + panic!("max size {} too large", max_size); + } + if initial_size > max_size { + panic!("initial size {initial_size} larger than max size {max_size}"); + } + + // The actual initial / max size is the one given by the caller, plus the size of + // 'SharedStruct'. 
+ let initial_size = HEADER_SIZE + initial_size; + let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap(); + + // Reserve address space for it with mmap + // + // TODO: Use MAP_HUGETLB if possible + let start_ptr = unsafe { + nix_mmap( + None, + max_size, + ProtFlags::PROT_READ | ProtFlags::PROT_WRITE, + MapFlags::MAP_SHARED, + &fd, + 0, + ) + } + .map_err(|e| Error::new("mmap failed: {e}", e))?; + + // Reserve space for the initial size + enlarge_file(fd.as_fd(), initial_size as u64)?; + + // Initialize the header + let shared: NonNull = start_ptr.cast(); + unsafe { + shared.write(SharedStruct { + max_size: max_size.into(), + current_size: AtomicUsize::new(initial_size), + }) + }; + + // The user data begins after the header + let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) }; + + Ok(ShmemHandle { + fd, + max_size: max_size.into(), + shared_ptr: shared, + data_ptr, + }) + } + + // return reference to the header + fn shared(&self) -> &SharedStruct { + unsafe { self.shared_ptr.as_ref() } + } + + /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified + /// when creating the area. + /// + /// This may only be called from one process/thread concurrently. We detect that case + /// and return an Error. + pub fn set_size(&self, new_size: usize) -> Result<(), Error> { + let new_size = new_size + HEADER_SIZE; + let shared = self.shared(); + + if new_size > self.max_size { + panic!( + "new size ({} is greater than max size ({})", + new_size, self.max_size + ); + } + assert_eq!(self.max_size, shared.max_size); + + // Lock the area by setting the bit in 'current_size' + // + // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory + // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But + // since this is not performance-critical, better safe than sorry . 
+ let mut old_size = shared.current_size.load(Ordering::Acquire); + loop { + if (old_size & RESIZE_IN_PROGRESS) != 0 { + return Err(Error::new( + "concurrent resize detected", + Errno::UnknownErrno, + )); + } + match shared.current_size.compare_exchange( + old_size, + new_size, + Ordering::Acquire, + Ordering::Relaxed, + ) { + Ok(_) => break, + Err(x) => old_size = x, + } + } + + // Ok, we got the lock. + // + // NB: If anything goes wrong, we *must* clear the bit! + let result = { + use std::cmp::Ordering::{Equal, Greater, Less}; + match new_size.cmp(&old_size) { + Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| { + Error::new("could not shrink shmem segment, ftruncate failed: {e}", e) + }), + Equal => Ok(()), + Greater => enlarge_file(self.fd.as_fd(), new_size as u64), + } + }; + + // Unlock + shared.current_size.store( + if result.is_ok() { new_size } else { old_size }, + Ordering::Release, + ); + + result + } + + /// Returns the current user-visible size of the shared memory segment. + /// + /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's + /// responsibility not to access the area beyond the current size. + pub fn current_size(&self) -> usize { + let total_current_size = + self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS; + total_current_size - HEADER_SIZE + } +} + +impl Drop for ShmemHandle { + fn drop(&mut self) { + // SAFETY: The pointer was obtained from mmap() with the given size. + // We unmap the entire region. + let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) }; + // The fd is dropped automatically by OwnedFd. + } +} + +/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an +/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for +/// development and testing, but in production we want the file to stay in memory. 
+/// +/// Disable 'unused_variables' warnings, because in the macos path, 'name' is unused. +#[allow(unused_variables)] +fn create_backing_file(name: &str) -> Result<OwnedFd, Error> { + #[cfg(not(target_os = "macos"))] + { + nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty()) + .map_err(|e| Error::new("memfd_create failed", e)) + } + #[cfg(target_os = "macos")] + { + let file = tempfile::tempfile().map_err(|e| { + Error::new( + "could not create temporary file to back shmem area", + nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)), + ) + })?; + Ok(OwnedFd::from(file)) + } +} + +fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> { + // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that + // we don't get a segfault later when trying to actually use it. + #[cfg(not(target_os = "macos"))] + { + nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| { + Error::new( + "could not grow shmem segment, posix_fallocate failed", + e, + ) + }) + } + // As a fallback on macos, which doesn't have posix_fallocate, use plain 'ftruncate' + #[cfg(target_os = "macos")] + { + nix::unistd::ftruncate(fd, size as i64) + .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use nix::unistd::ForkResult; + use std::ops::Range; + + /// check that all bytes in given range have the expected value. 
+ fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) { + for i in range { + let b = unsafe { *(ptr.add(i)) }; + assert_eq!(expected, b, "unexpected byte at offset {}", i); + } + } + + /// Write 'b' to all bytes in the given range + fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) { + unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) }; + } + + // simple single-process test of growing and shrinking + #[test] + fn test_shmem_resize() -> Result<(), Error> { + let max_size = 1024 * 1024; + let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?; + + assert_eq!(init_struct.current_size(), 0); + + // Initial grow + let size1 = 10000; + init_struct.set_size(size1).unwrap(); + assert_eq!(init_struct.current_size(), size1); + + // Write some data + let data_ptr = init_struct.data_ptr.as_ptr(); + write_range(data_ptr, 0xAA, 0..size1); + assert_range(data_ptr, 0xAA, 0..size1); + + // Shrink + let size2 = 5000; + init_struct.set_size(size2).unwrap(); + assert_eq!(init_struct.current_size(), size2); + + // Grow again + let size3 = 20000; + init_struct.set_size(size3).unwrap(); + assert_eq!(init_struct.current_size(), size3); + + // Try to read it. The area that was shrunk and grown again should read as all zeros now + assert_range(data_ptr, 0xAA, 0..5000); + assert_range(data_ptr, 0, 5000..size1); + + // Try to grow beyond max_size + //let size4 = max_size + 1; + //assert!(init_struct.set_size(size4).is_err()); + + // Dropping init_struct should unmap the memory + drop(init_struct); + + Ok(()) + } + + /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier, + /// but is stored in the shared memory area and works across processes. It's implemented by + /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes. 
+ struct SimpleBarrier { + num_procs: usize, + count: AtomicUsize, + } + + impl SimpleBarrier { + unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) { + unsafe { + ptr.write(SimpleBarrier { + num_procs, + count: AtomicUsize::new(0), + }) + } + } + + pub fn wait(&self) { + let old = self.count.fetch_add(1, Ordering::AcqRel); + + let generation = old / self.num_procs; + + let mut current = old + 1; + while current < (generation + 1) * self.num_procs { + std::thread::sleep(std::time::Duration::from_millis(10)); + current = self.count.load(Ordering::Acquire); + } + } + } + + #[test] + fn test_multi_process() { + // Initialize + let max_size = 1_000_000_000_000; + let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap(); + let ptr = init_struct.data_ptr.as_ptr(); + + // Store the SimpleBarrier in the first 1k of the area. + init_struct.set_size(10000).unwrap(); + let barrier_ptr: *mut SimpleBarrier = unsafe { + ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>())) + .cast() + }; + unsafe { SimpleBarrier::init(barrier_ptr, 2) }; + let barrier = unsafe { barrier_ptr.as_ref().unwrap() }; + + // Fork another test process. The code after this runs in both processes concurrently. + let fork_result = unsafe { nix::unistd::fork().unwrap() }; + + // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000 + if fork_result.is_parent() { + write_range(ptr, 0xAA, 1000..2000); + } else { + write_range(ptr, 0xBB, 2000..3000); + } + barrier.wait(); + // Verify the contents. 
(in both processes) + assert_range(ptr, 0xAA, 1000..2000); + assert_range(ptr, 0xBB, 2000..3000); + + // Grow, from the child this time + let size = 10_000_000; + if !fork_result.is_parent() { + init_struct.set_size(size).unwrap(); + } + barrier.wait(); + + // make some writes at the end + if fork_result.is_parent() { + write_range(ptr, 0xAA, (size - 10)..size); + } else { + write_range(ptr, 0xBB, (size - 20)..(size - 10)); + } + barrier.wait(); + + // Verify the contents. (This runs in both processes) + assert_range(ptr, 0, (size - 1000)..(size - 20)); + assert_range(ptr, 0xBB, (size - 20)..(size - 10)); + assert_range(ptr, 0xAA, (size - 10)..size); + + if let ForkResult::Parent { child } = fork_result { + nix::sys::wait::waitpid(child, None).unwrap(); + } + } +} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index b64c42a808..f2ba50a86f 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -182,6 +182,7 @@ pub struct ConfigToml { pub tracing: Option, pub enable_tls_page_service_api: bool, pub dev_mode: bool, + pub timeline_import_config: TimelineImportConfig, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -300,6 +301,13 @@ impl From for tracing_utils::Protocol { } } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +pub struct TimelineImportConfig { + pub import_job_concurrency: NonZeroUsize, + pub import_job_soft_size_limit: NonZeroUsize, + pub import_job_checkpoint_threshold: NonZeroUsize, +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -632,23 +640,15 @@ impl Default for ConfigToml { tenant_config: TenantConfigToml::default(), no_sync: None, wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, - page_service_pipelining: if !cfg!(test) { - PageServicePipeliningConfig::Serial - } else { - // Do not turn this into the default until scattered reads have been - 
// validated and rolled-out fully. - PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + page_service_pipelining: PageServicePipeliningConfig::Pipelined( + PageServicePipeliningConfigPipelined { max_batch_size: NonZeroUsize::new(32).unwrap(), execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, batching: PageServiceProtocolPipelinedBatchingStrategy::ScatteredLsn, - }) - }, - get_vectored_concurrent_io: if !cfg!(test) { - GetVectoredConcurrentIo::Sequential - } else { - GetVectoredConcurrentIo::SidecarTask - }, - enable_read_path_debugging: if cfg!(test) || cfg!(feature = "testing") { + }, + ), + get_vectored_concurrent_io: GetVectoredConcurrentIo::SidecarTask, + enable_read_path_debugging: if cfg!(feature = "testing") { Some(true) } else { None @@ -659,6 +659,11 @@ impl Default for ConfigToml { tracing: None, enable_tls_page_service_api: false, dev_mode: false, + timeline_import_config: TimelineImportConfig { + import_job_concurrency: NonZeroUsize::new(128).unwrap(), + import_job_soft_size_limit: NonZeroUsize::new(1024 * 1024 * 1024).unwrap(), + import_job_checkpoint_threshold: NonZeroUsize::new(128).unwrap(), + }, } } } diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 0c4d7fd4cb..c14975167b 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -910,6 +910,11 @@ impl Key { self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff } + #[inline(always)] + pub fn is_rel_block_of_rel(&self, rel: Oid) -> bool { + self.is_rel_block_key() && self.field4 == rel + } + #[inline(always)] pub fn is_rel_dir_key(&self) -> bool { self.field1 == 0x00 diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index ff911499ab..e9b37c8ca6 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -336,14 +336,30 @@ impl TimelineCreateRequest { #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, 
Eq)] pub enum ShardImportStatus { - InProgress, + InProgress(Option), Done, Error(String), } + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub enum ShardImportProgress { + V1(ShardImportProgressV1), +} + +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +pub struct ShardImportProgressV1 { + /// Total number of jobs in the import plan + pub jobs: usize, + /// Number of jobs completed + pub completed: usize, + /// Hash of the plan + pub import_plan_hash: u64, +} + impl ShardImportStatus { pub fn is_terminal(&self) -> bool { match self { - ShardImportStatus::InProgress => false, + ShardImportStatus::InProgress(_) => false, ShardImportStatus::Done | ShardImportStatus::Error(_) => true, } } @@ -1803,7 +1819,6 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { - use std::sync::LazyLock; #[derive( Copy, @@ -1832,6 +1847,7 @@ pub mod virtual_file { Eq, Hash, strum_macros::EnumString, + strum_macros::EnumIter, strum_macros::Display, serde_with::DeserializeFromStr, serde_with::SerializeDisplay, @@ -1843,37 +1859,14 @@ pub mod virtual_file { /// Uses buffered IO. Buffered, /// Uses direct IO for reads only. - #[cfg(target_os = "linux")] Direct, /// Use direct IO for reads and writes. - #[cfg(target_os = "linux")] DirectRw, } impl IoMode { pub fn preferred() -> Self { - // The default behavior when running Rust unit tests without any further - // flags is to use the newest behavior (DirectRw). - // The CI uses the following environment variable to unit tests for all - // different modes. - // NB: the Python regression & perf tests have their own defaults management - // that writes pageserver.toml; they do not use this variable. 
- if cfg!(test) { - static CACHED: LazyLock = LazyLock::new(|| { - utils::env::var_serde_json_string( - "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE", - ) - .unwrap_or( - #[cfg(target_os = "linux")] - IoMode::DirectRw, - #[cfg(not(target_os = "linux"))] - IoMode::Buffered, - ) - }); - *CACHED - } else { - IoMode::Buffered - } + IoMode::DirectRw } } @@ -1883,9 +1876,7 @@ pub mod virtual_file { fn try_from(value: u8) -> Result { Ok(match value { v if v == (IoMode::Buffered as u8) => IoMode::Buffered, - #[cfg(target_os = "linux")] v if v == (IoMode::Direct as u8) => IoMode::Direct, - #[cfg(target_os = "linux")] v if v == (IoMode::DirectRw as u8) => IoMode::DirectRw, x => return Err(x), }) diff --git a/libs/pageserver_api/src/upcall_api.rs b/libs/pageserver_api/src/upcall_api.rs index 7ee63f9036..4dce5f7817 100644 --- a/libs/pageserver_api/src/upcall_api.rs +++ b/libs/pageserver_api/src/upcall_api.rs @@ -4,6 +4,7 @@ //! See docs/rfcs/025-generation-numbers.md use serde::{Deserialize, Serialize}; +use utils::generation::Generation; use utils::id::{NodeId, TimelineId}; use crate::controller_api::NodeRegisterRequest; @@ -63,9 +64,17 @@ pub struct ValidateResponseTenant { pub valid: bool, } +#[derive(Serialize, Deserialize)] +pub struct TimelineImportStatusRequest { + pub tenant_shard_id: TenantShardId, + pub timeline_id: TimelineId, + pub generation: Generation, +} + #[derive(Serialize, Deserialize)] pub struct PutTimelineImportStatusRequest { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, pub status: ShardImportStatus, + pub generation: Generation, } diff --git a/libs/pageserver_api/src/value.rs b/libs/pageserver_api/src/value.rs index 883d903ff3..e9000939c3 100644 --- a/libs/pageserver_api/src/value.rs +++ b/libs/pageserver_api/src/value.rs @@ -36,6 +36,24 @@ impl Value { Value::WalRecord(rec) => rec.will_init(), } } + + #[inline(always)] + pub fn estimated_size(&self) -> usize { + match self { + Value::Image(image) => image.len(), + 
Value::WalRecord(NeonWalRecord::AuxFile { + content: Some(content), + .. + }) => content.len(), + Value::WalRecord(NeonWalRecord::Postgres { rec, .. }) => rec.len(), + Value::WalRecord(NeonWalRecord::ClogSetAborted { xids }) => xids.len() * 4, + Value::WalRecord(NeonWalRecord::ClogSetCommitted { xids, .. }) => xids.len() * 4, + Value::WalRecord(NeonWalRecord::MultixactMembersCreate { members, .. }) => { + members.len() * 8 + } + _ => 8192, /* use image size as the estimation */ + } + } } #[derive(Debug, PartialEq)] diff --git a/libs/posthog_client_lite/Cargo.toml b/libs/posthog_client_lite/Cargo.toml new file mode 100644 index 0000000000..7c19bf2ccb --- /dev/null +++ b/libs/posthog_client_lite/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "posthog_client_lite" +version = "0.1.0" +edition = "2024" +license.workspace = true + +[dependencies] +anyhow.workspace = true +reqwest.workspace = true +serde.workspace = true +serde_json.workspace = true +sha2.workspace = true +workspace_hack.workspace = true +thiserror.workspace = true diff --git a/libs/posthog_client_lite/src/lib.rs b/libs/posthog_client_lite/src/lib.rs new file mode 100644 index 0000000000..53deb26ab7 --- /dev/null +++ b/libs/posthog_client_lite/src/lib.rs @@ -0,0 +1,634 @@ +//! A lite version of the PostHog client that only supports local evaluation of feature flags. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; +use serde_json::json; +use sha2::Digest; + +#[derive(Debug, thiserror::Error)] +pub enum PostHogEvaluationError { + /// The feature flag is not available, for example, because the local evaluation data is not populated yet. + #[error("Feature flag not available: {0}")] + NotAvailable(String), + #[error("No condition group is matched")] + NoConditionGroupMatched, + /// Real errors, e.g., the rollout percentage does not add up to 100. 
+ #[error("Failed to evaluate feature flag: {0}")] + Internal(String), +} + +#[derive(Deserialize)] +pub struct LocalEvaluationResponse { + #[allow(dead_code)] + flags: Vec<LocalEvaluationFlag>, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlag { + key: String, + filters: LocalEvaluationFlagFilters, + active: bool, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagFilters { + groups: Vec<LocalEvaluationFlagFilterGroup>, + multivariate: LocalEvaluationFlagMultivariate, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagFilterGroup { + variant: Option<String>, + properties: Option<Vec<LocalEvaluationFlagFilterProperty>>, + rollout_percentage: i64, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagFilterProperty { + key: String, + value: PostHogFlagFilterPropertyValue, + operator: String, +} + +#[derive(Debug, Serialize, Deserialize)] +#[serde(untagged)] +pub enum PostHogFlagFilterPropertyValue { + String(String), + Number(f64), + Boolean(bool), + List(Vec<String>), +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagMultivariate { + variants: Vec<LocalEvaluationFlagMultivariateVariant>, +} + +#[derive(Deserialize)] +pub struct LocalEvaluationFlagMultivariateVariant { + key: String, + rollout_percentage: i64, +} + +pub struct FeatureStore { + flags: HashMap<String, LocalEvaluationFlag>, +} + +impl Default for FeatureStore { + fn default() -> Self { + Self::new() + } +} + +enum GroupEvaluationResult { + MatchedAndOverride(String), + MatchedAndEvaluate, + Unmatched, +} + +impl FeatureStore { + pub fn new() -> Self { + Self { + flags: HashMap::new(), + } + } + + pub fn set_flags(&mut self, flags: Vec<LocalEvaluationFlag>) { + self.flags.clear(); + for flag in flags { + self.flags.insert(flag.key.clone(), flag); + } + } + + /// Generate a consistent hash for a user ID (e.g., tenant ID). + /// + /// The implementation is different from PostHog SDK. In PostHog SDK, it is sha1 of `user_id.distinct_id.salt`. 
+ /// However, as we do not upload all of our tenant IDs to PostHog, we do not have the PostHog distinct_id for a + /// tenant. Therefore, the way we compute it is sha256 of `user_id.feature_id.salt`. 
+ fn consistent_hash(user_id: &str, flag_key: &str, salt: &str) -> f64 { + let mut hasher = sha2::Sha256::new(); + hasher.update(user_id); + hasher.update("."); + hasher.update(flag_key); + hasher.update("."); + hasher.update(salt); + let hash = hasher.finalize(); + let hash_int = u64::from_le_bytes(hash[..8].try_into().unwrap()); + hash_int as f64 / u64::MAX as f64 + } + + /// Evaluate a condition. Returns an error if the condition cannot be evaluated due to parsing error or missing + /// property. + fn evaluate_condition( + &self, + operator: &str, + provided: &PostHogFlagFilterPropertyValue, + requested: &PostHogFlagFilterPropertyValue, + ) -> Result { + match operator { + "exact" => { + let PostHogFlagFilterPropertyValue::String(provided) = provided else { + // Left should be a string + return Err(PostHogEvaluationError::Internal(format!( + "The left side of the condition is not a string: {:?}", + provided + ))); + }; + let PostHogFlagFilterPropertyValue::List(requested) = requested else { + // Right should be a list of string + return Err(PostHogEvaluationError::Internal(format!( + "The right side of the condition is not a list: {:?}", + requested + ))); + }; + Ok(requested.contains(provided)) + } + "lt" | "gt" => { + let PostHogFlagFilterPropertyValue::String(requested) = requested else { + // Right should be a string + return Err(PostHogEvaluationError::Internal(format!( + "The right side of the condition is not a string: {:?}", + requested + ))); + }; + let Ok(requested) = requested.parse::() else { + return Err(PostHogEvaluationError::Internal(format!( + "Can not parse the right side of the condition as a number: {:?}", + requested + ))); + }; + // Left can either be a number or a string + let provided = match provided { + PostHogFlagFilterPropertyValue::Number(provided) => *provided, + PostHogFlagFilterPropertyValue::String(provided) => { + let Ok(provided) = provided.parse::() else { + return Err(PostHogEvaluationError::Internal(format!( + "Can not parse 
the left side of the condition as a number: {:?}", + provided + ))); + }; + provided + } + _ => { + return Err(PostHogEvaluationError::Internal(format!( + "The left side of the condition is not a number or a string: {:?}", + provided + ))); + } + }; + match operator { + "lt" => Ok(provided < requested), + "gt" => Ok(provided > requested), + op => Err(PostHogEvaluationError::Internal(format!( + "Unsupported operator: {}", + op + ))), + } + } + _ => Err(PostHogEvaluationError::Internal(format!( + "Unsupported operator: {}", + operator + ))), + } + } + + /// Evaluate a percentage. + fn evaluate_percentage(&self, mapped_user_id: f64, percentage: i64) -> bool { + mapped_user_id <= percentage as f64 / 100.0 + } + + /// Evaluate a filter group for a feature flag. Returns an error if there are errors during the evaluation. + /// + /// Return values: + /// Ok(GroupEvaluationResult::MatchedAndOverride(variant)): matched and evaluated to this value + /// Ok(GroupEvaluationResult::MatchedAndEvaluate): condition matched but no variant override, use the global rollout percentage + /// Ok(GroupEvaluationResult::Unmatched): condition unmatched + fn evaluate_group( + &self, + group: &LocalEvaluationFlagFilterGroup, + hash_on_group_rollout_percentage: f64, + provided_properties: &HashMap, + ) -> Result { + if let Some(ref properties) = group.properties { + for property in properties { + if let Some(value) = provided_properties.get(&property.key) { + // The user provided the property value + if !self.evaluate_condition( + property.operator.as_ref(), + value, + &property.value, + )? 
{ + return Ok(GroupEvaluationResult::Unmatched); + } + } else { + // We cannot evaluate, the property is not available + return Err(PostHogEvaluationError::NotAvailable(format!( + "The required property in the condition is not available: {}", + property.key + ))); + } + } + } + + // The group has no condition matchers or we matched the properties + if self.evaluate_percentage(hash_on_group_rollout_percentage, group.rollout_percentage) { + if let Some(ref variant_override) = group.variant { + Ok(GroupEvaluationResult::MatchedAndOverride( + variant_override.clone(), + )) + } else { + Ok(GroupEvaluationResult::MatchedAndEvaluate) + } + } else { + Ok(GroupEvaluationResult::Unmatched) + } + } + + /// Evaluate a multivariate feature flag. Returns `None` if the flag is not available or if there are errors + /// during the evaluation. + /// + /// The parsing logic is as follows: + /// + /// * Match each filter group. + /// - If a group is matched, it will first determine whether the user is in the range of the group's rollout + /// percentage. We will generate a consistent hash for the user ID on the group rollout percentage. This hash + /// is shared across all groups. + /// - If the hash falls within the group's rollout percentage, return the variant if it's overridden, or + /// - Evaluate the variant using the global config and the global rollout percentage. + /// * Otherwise, continue with the next group until all groups are evaluated and no group is within the + /// rollout percentage. + /// * If there are no matching groups, return an error. + /// + /// Example: we have a multivariate flag with 3 groups of the configured global rollout percentage: A (10%), B (20%), C (70%). + /// There is a single group with a condition that has a rollout percentage of 10% and it does not have a variant override. + /// Then, we will have 1% of the users evaluated to A, 2% to B, and 7% to C. 
+ pub fn evaluate_multivariate( + &self, + flag_key: &str, + user_id: &str, + ) -> Result { + let hash_on_global_rollout_percentage = + Self::consistent_hash(user_id, flag_key, "multivariate"); + let hash_on_group_rollout_percentage = + Self::consistent_hash(user_id, flag_key, "within_group"); + self.evaluate_multivariate_inner( + flag_key, + hash_on_global_rollout_percentage, + hash_on_group_rollout_percentage, + &HashMap::new(), + ) + } + + /// Evaluate a multivariate feature flag. Note that we directly take the mapped user ID + /// (a consistent hash ranging from 0 to 1) so that it is easier to use it in the tests + /// and avoid duplicate computations. + /// + /// Use a different consistent hash for evaluating the group rollout percentage. + /// The behavior: if the condition is set to rolling out to 10% of the users, and + /// we set the variant A to 20% in the global config, then 2% of the total users will + /// be evaluated to variant A. + /// + /// Note that the hash to determine group rollout percentage is shared across all groups. So if we have two + /// exactly-the-same conditions with 10% and 20% rollout percentage respectively, a total of 20% of the users + /// will be evaluated (versus 30% if group evaluation is done independently). + pub(crate) fn evaluate_multivariate_inner( + &self, + flag_key: &str, + hash_on_global_rollout_percentage: f64, + hash_on_group_rollout_percentage: f64, + properties: &HashMap, + ) -> Result { + if let Some(flag_config) = self.flags.get(flag_key) { + if !flag_config.active { + return Err(PostHogEvaluationError::NotAvailable(format!( + "The feature flag is not active: {}", + flag_key + ))); + } + // TODO: sort the groups so that variant overrides always get evaluated first and it follows the PostHog + // Python SDK behavior; for now we do not configure conditions without variant overrides in Neon so it + // does not matter. 
+ for group in &flag_config.filters.groups { + match self.evaluate_group(group, hash_on_group_rollout_percentage, properties)? { + GroupEvaluationResult::MatchedAndOverride(variant) => return Ok(variant), + GroupEvaluationResult::MatchedAndEvaluate => { + let mut percentage = 0; + for variant in &flag_config.filters.multivariate.variants { + percentage += variant.rollout_percentage; + if self + .evaluate_percentage(hash_on_global_rollout_percentage, percentage) + { + return Ok(variant.key.clone()); + } + } + // This should not happen because the rollout percentage always adds up to 100, but just in case that PostHog + // returned invalid spec, we return an error. + return Err(PostHogEvaluationError::Internal(format!( + "Rollout percentage does not add up to 100: {}", + flag_key + ))); + } + GroupEvaluationResult::Unmatched => continue, + } + } + // If no group is matched, the feature is not available, and up to the caller to decide what to do. + Err(PostHogEvaluationError::NoConditionGroupMatched) + } else { + // The feature flag is not available yet + Err(PostHogEvaluationError::NotAvailable(format!( + "Not found in the local evaluation spec: {}", + flag_key + ))) + } + } +} + +/// A lite PostHog client. +/// +/// At the point of writing this code, PostHog does not have a functional Rust client with feature flag support. +/// This is a lite version that only supports local evaluation of feature flags and only supports those JSON specs +/// that will be used within Neon. +/// +/// PostHog is designed as a browser-server system: the browser (client) side uses the client key and is exposed +/// to the end users; the server side uses a server key and is not exposed to the end users. The client and the +/// server has different API keys and provide a different set of APIs. In Neon, we only have the server (that is +/// pageserver), and it will use both the client API and the server API. So we need to store two API keys within +/// our PostHog client. 
+/// +/// The server API is used to fetch the feature flag specs. The client API is used to capture events in case we +/// want to report the feature flag usage back to PostHog. The current plan is to use PostHog only as an UI to +/// configure feature flags so it is very likely that the client API will not be used. +pub struct PostHogClient { + /// The server API key. + server_api_key: String, + /// The client API key. + client_api_key: String, + /// The project ID. + project_id: String, + /// The private API URL. + private_api_url: String, + /// The public API URL. + public_api_url: String, + /// The HTTP client. + client: reqwest::Client, +} + +impl PostHogClient { + pub fn new( + server_api_key: String, + client_api_key: String, + project_id: String, + private_api_url: String, + public_api_url: String, + ) -> Self { + let client = reqwest::Client::new(); + Self { + server_api_key, + client_api_key, + project_id, + private_api_url, + public_api_url, + client, + } + } + + pub fn new_with_us_region( + server_api_key: String, + client_api_key: String, + project_id: String, + ) -> Self { + Self::new( + server_api_key, + client_api_key, + project_id, + "https://us.posthog.com".to_string(), + "https://us.i.posthog.com".to_string(), + ) + } + + /// Fetch the feature flag specs from the server. + /// + /// This is unfortunately an undocumented API at: + /// - + /// - + /// + /// The handling logic in [`FeatureStore`] mostly follows the Python API implementation. 
+ /// See `_compute_flag_locally` in + pub async fn get_feature_flags_local_evaluation( + &self, + ) -> anyhow::Result { + // BASE_URL/api/projects/:project_id/feature_flags/local_evaluation + // with bearer token of self.server_api_key + let url = format!( + "{}/api/projects/{}/feature_flags/local_evaluation", + self.private_api_url, self.project_id + ); + let response = self + .client + .get(url) + .bearer_auth(&self.server_api_key) + .send() + .await?; + let body = response.text().await?; + Ok(serde_json::from_str(&body)?) + } + + /// Capture an event. This will only be used to report the feature flag usage back to PostHog, though + /// it also supports a lot of other functionalities. + /// + /// + pub async fn capture_event( + &self, + event: &str, + distinct_id: &str, + properties: &HashMap, + ) -> anyhow::Result<()> { + // PUBLIC_URL/capture/ + // with bearer token of self.client_api_key + let url = format!("{}/capture/", self.public_api_url); + self.client + .post(url) + .body(serde_json::to_string(&json!({ + "api_key": self.client_api_key, + "distinct_id": distinct_id, + "event": event, + "properties": properties, + }))?)
+ .send() + .await?; + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn data() -> &'static str { + r#"{ + "flags": [ + { + "id": 132794, + "team_id": 152860, + "name": "", + "key": "gc-compaction", + "filters": { + "groups": [ + { + "variant": "enabled-stage-2", + "properties": [ + { + "key": "plan_type", + "type": "person", + "value": [ + "free" + ], + "operator": "exact" + }, + { + "key": "pageserver_remote_size", + "type": "person", + "value": "10000000", + "operator": "lt" + } + ], + "rollout_percentage": 50 + }, + { + "properties": [ + { + "key": "plan_type", + "type": "person", + "value": [ + "free" + ], + "operator": "exact" + }, + { + "key": "pageserver_remote_size", + "type": "person", + "value": "10000000", + "operator": "lt" + } + ], + "rollout_percentage": 80 + } + ], + "payloads": {}, + "multivariate": { + "variants": [ + { + "key": "disabled", + "name": "", + "rollout_percentage": 90 + }, + { + "key": "enabled-stage-1", + "name": "", + "rollout_percentage": 10 + }, + { + "key": "enabled-stage-2", + "name": "", + "rollout_percentage": 0 + }, + { + "key": "enabled-stage-3", + "name": "", + "rollout_percentage": 0 + }, + { + "key": "enabled", + "name": "", + "rollout_percentage": 0 + } + ] + } + }, + "deleted": false, + "active": true, + "ensure_experience_continuity": false, + "has_encrypted_payloads": false, + "version": 6 + } + ], + "group_type_mapping": {}, + "cohorts": {} + }"# + } + + #[test] + fn parse_local_evaluation() { + let data = data(); + let _: LocalEvaluationResponse = serde_json::from_str(data).unwrap(); + } + + #[test] + fn evaluate_multivariate() { + let mut store = FeatureStore::new(); + let response: LocalEvaluationResponse = serde_json::from_str(data()).unwrap(); + store.set_flags(response.flags); + + // This lacks the required properties and cannot be evaluated. 
+ let variant = + store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &HashMap::new()); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NotAvailable(_)) + ),); + + let properties_unmatched = HashMap::from([ + ( + "plan_type".to_string(), + PostHogFlagFilterPropertyValue::String("paid".to_string()), + ), + ( + "pageserver_remote_size".to_string(), + PostHogFlagFilterPropertyValue::Number(1000.0), + ), + ]); + + // This does not match any group so there will be an error. + let variant = + store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.40, &properties_unmatched); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NoConditionGroupMatched) + ),); + let variant = + store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.80, &properties_unmatched); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NoConditionGroupMatched) + ),); + + let properties = HashMap::from([ + ( + "plan_type".to_string(), + PostHogFlagFilterPropertyValue::String("free".to_string()), + ), + ( + "pageserver_remote_size".to_string(), + PostHogFlagFilterPropertyValue::Number(1000.0), + ), + ]); + + // It matches the first group as 0.10 <= 0.50 and the properties are matched. Then it gets evaluated to the variant override. + let variant = store.evaluate_multivariate_inner("gc-compaction", 0.10, 0.10, &properties); + assert_eq!(variant.unwrap(), "enabled-stage-2".to_string()); + + // It matches the second group as 0.50 <= 0.60 <= 0.80 and the properties are matched. Then it gets evaluated using the global percentage. + let variant = store.evaluate_multivariate_inner("gc-compaction", 0.99, 0.60, &properties); + assert_eq!(variant.unwrap(), "enabled-stage-1".to_string()); + let variant = store.evaluate_multivariate_inner("gc-compaction", 0.80, 0.60, &properties); + assert_eq!(variant.unwrap(), "disabled".to_string()); + + // It matches the group conditions but not the group rollout percentage. 
+ let variant = store.evaluate_multivariate_inner("gc-compaction", 1.00, 0.90, &properties); + assert!(matches!( + variant, + Err(PostHogEvaluationError::NoConditionGroupMatched) + ),); + } +} diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index a5cddb840f..5363e935e3 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -330,11 +330,18 @@ impl AzureBlobStorage { if let Err(DownloadError::Timeout) = &next_item { timeout_try_cnt += 1; if timeout_try_cnt <= 5 { - continue; + continue 'outer; } } - let next_item = next_item?; + let next_item = match next_item { + Ok(next_item) => next_item, + Err(e) => { + // The error is potentially retryable, so we must rewind the loop after yielding. + yield Err(e); + continue 'outer; + }, + }; // Log a warning if we saw two timeouts in a row before a successful request if timeout_try_cnt > 2 { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 918d9d5a6b..d98ff552ee 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -657,7 +657,14 @@ impl RemoteStorage for S3Bucket { res = request => Ok(res), _ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout), _ = cancel.cancelled() => Err(DownloadError::Cancelled), - }?; + }; + + if let Err(DownloadError::Timeout) = &response { + yield Err(DownloadError::Timeout); + continue 'outer; + } + + let response = response?; // always yield cancellation errors and stop the stream let response = response .context("Failed to list S3 prefixes") diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index cc31b38fe7..8658dc4011 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -299,6 +299,7 @@ pub struct PullTimelineRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub http_hosts: Vec, + pub ignore_tombstone: Option, } #[derive(Debug, 
Serialize, Deserialize)] diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index 215fa36df4..45acaf682f 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::fs::{self, File}; use std::io::{self, Write}; -use std::os::fd::AsRawFd; +use std::os::fd::AsFd; use camino::{Utf8Path, Utf8PathBuf}; @@ -210,13 +210,13 @@ pub fn overwrite( /// Syncs the filesystem for the given file descriptor. #[cfg_attr(target_os = "macos", allow(unused_variables))] -pub fn syncfs(fd: impl AsRawFd) -> anyhow::Result<()> { +pub fn syncfs(fd: impl AsFd) -> anyhow::Result<()> { // Linux guarantees durability for syncfs. // POSIX doesn't have syncfs, and further does not actually guarantee durability of sync(). #[cfg(target_os = "linux")] { use anyhow::Context; - nix::unistd::syncfs(fd.as_raw_fd()).context("syncfs")?; + nix::unistd::syncfs(fd).context("syncfs")?; } #[cfg(target_os = "macos")] { diff --git a/libs/utils/src/fs_ext/rename_noreplace.rs b/libs/utils/src/fs_ext/rename_noreplace.rs index d0c07353d0..c945ecadf0 100644 --- a/libs/utils/src/fs_ext/rename_noreplace.rs +++ b/libs/utils/src/fs_ext/rename_noreplace.rs @@ -11,9 +11,9 @@ pub fn rename_noreplace( #[cfg(all(target_os = "linux", target_env = "gnu"))] { nix::fcntl::renameat2( - None, + nix::fcntl::AT_FDCWD, src, - None, + nix::fcntl::AT_FDCWD, dst, nix::fcntl::RenameFlags::RENAME_NOREPLACE, ) diff --git a/libs/utils/src/lock_file.rs b/libs/utils/src/lock_file.rs index 6aeeeca021..b3c8d74d7d 100644 --- a/libs/utils/src/lock_file.rs +++ b/libs/utils/src/lock_file.rs @@ -1,6 +1,6 @@ //! A module to create and read lock files. //! -//! File locking is done using [`fcntl::flock`] exclusive locks. +//! File locking is done using [`nix::fcntl::Flock`] exclusive locks. //! The only consumer of this module is currently //! [`pid_file`](crate::pid_file). See the module-level comment //! 
there for potential pitfalls with lock files that are used @@ -9,26 +9,25 @@ use std::fs; use std::io::{Read, Write}; use std::ops::Deref; -use std::os::unix::prelude::AsRawFd; use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use nix::errno::Errno::EAGAIN; -use nix::fcntl; +use nix::fcntl::{Flock, FlockArg}; use crate::crashsafe; -/// A handle to an open and unlocked, but not-yet-written lock file. +/// A handle to an open and flocked, but not-yet-written lock file. /// Returned by [`create_exclusive`]. #[must_use] pub struct UnwrittenLockFile { path: Utf8PathBuf, - file: fs::File, + file: Flock, } /// Returned by [`UnwrittenLockFile::write_content`]. #[must_use] -pub struct LockFileGuard(fs::File); +pub struct LockFileGuard(Flock); impl Deref for LockFileGuard { type Target = fs::File; @@ -67,17 +66,14 @@ pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result Ok(UnwrittenLockFile { + Ok(lock_file) => Ok(UnwrittenLockFile { path: lock_file_path.to_owned(), file: lock_file, }), - Err(EAGAIN) => anyhow::bail!("file is already locked"), - Err(e) => Err(e).context("flock error"), + Err((_, EAGAIN)) => anyhow::bail!("file is already locked"), + Err((_, e)) => Err(e).context("flock error"), } } @@ -105,32 +101,37 @@ pub enum LockFileRead { /// Check the [`LockFileRead`] variants for details. pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result { let res = fs::OpenOptions::new().read(true).open(path); - let mut lock_file = match res { + let lock_file = match res { Ok(f) => f, Err(e) => match e.kind() { std::io::ErrorKind::NotFound => return Ok(LockFileRead::NotExist), _ => return Err(e).context("open lock file"), }, }; - let res = fcntl::flock( - lock_file.as_raw_fd(), - fcntl::FlockArg::LockExclusiveNonblock, - ); + let res = Flock::lock(lock_file, FlockArg::LockExclusiveNonblock); // We need the content regardless of lock success / failure. // But, read it after flock so that, if it succeeded, the content is consistent. 
- let mut content = String::new(); - lock_file - .read_to_string(&mut content) - .context("read lock file")?; match res { - Ok(()) => Ok(LockFileRead::NotHeldByAnyProcess( - LockFileGuard(lock_file), - content, - )), - Err(EAGAIN) => Ok(LockFileRead::LockedByOtherProcess { - not_locked_file: lock_file, - content, - }), - Err(e) => Err(e).context("flock error"), + Ok(mut locked_file) => { + let mut content = String::new(); + locked_file + .read_to_string(&mut content) + .context("read lock file")?; + Ok(LockFileRead::NotHeldByAnyProcess( + LockFileGuard(locked_file), + content, + )) + } + Err((mut not_locked_file, EAGAIN)) => { + let mut content = String::new(); + not_locked_file + .read_to_string(&mut content) + .context("read lock file")?; + Ok(LockFileRead::LockedByOtherProcess { + not_locked_file, + content, + }) + } + Err((_, e)) => Err(e).context("flock error"), } } diff --git a/libs/utils/src/rate_limit.rs b/libs/utils/src/rate_limit.rs index 945f710b1d..700cd5792b 100644 --- a/libs/utils/src/rate_limit.rs +++ b/libs/utils/src/rate_limit.rs @@ -17,7 +17,7 @@ impl std::fmt::Display for RateLimitStats { } impl RateLimit { - pub fn new(interval: Duration) -> Self { + pub const fn new(interval: Duration) -> Self { Self { last: None, interval, diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index 3d15e08400..857d98b644 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -127,12 +127,12 @@ macro_rules! 
__check_fields_present { match check_fields_present0($extractors) { Ok(FoundEverything) => Ok(()), - Ok(Unconfigured) if cfg!(test) => { + Ok(Unconfigured) if cfg!(feature = "testing") => { // allow unconfigured in tests Ok(()) }, Ok(Unconfigured) => { - panic!("utils::tracing_span_assert: outside of #[cfg(test)] expected tracing to be configured with tracing_error::ErrorLayer") + panic!(r#"utils::tracing_span_assert: outside of #[cfg(feature = "testing")] expected tracing to be configured with tracing_error::ErrorLayer"#) }, Err(missing) => Err(missing) } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 8abd504922..b7b3e0eaf1 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -96,6 +96,7 @@ strum.workspace = true strum_macros.workspace = true wal_decoder.workspace = true smallvec.workspace = true +twox-hash.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 2836450a0e..eaadfe14ae 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -14,6 +14,7 @@ use pageserver_api::key::Key; use pageserver_api::models::virtual_file::IoMode; use pageserver_api::shard::TenantShardId; use pageserver_api::value::Value; +use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; @@ -244,13 +245,7 @@ fn criterion_benchmark(c: &mut Criterion) { ]; let exploded_parameters = { let mut out = Vec::new(); - for io_mode in [ - IoMode::Buffered, - #[cfg(target_os = "linux")] - IoMode::Direct, - #[cfg(target_os = "linux")] - IoMode::DirectRw, - ] { + for io_mode in IoMode::iter() { for param in expect.clone() { let HandPickedParameters { volume_mib, diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4a87a91910..219e63c9d4 100644 --- a/pageserver/client/src/mgmt_api.rs +++ 
b/pageserver/client/src/mgmt_api.rs @@ -1,5 +1,6 @@ use std::collections::HashMap; use std::error::Error as _; +use std::time::Duration; use bytes::Bytes; use detach_ancestor::AncestorDetached; @@ -819,4 +820,25 @@ impl Client { .await .map(|resp| resp.status()) } + + pub async fn activate_post_import( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + activate_timeline_timeout: Duration, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/activate_post_import?timeline_activate_timeout_ms={}", + self.mgmt_api_endpoint, + tenant_shard_id, + timeline_id, + activate_timeline_timeout.as_millis() + ); + + self.request(Method::PUT, uri, ()) + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/page_api/Cargo.toml b/pageserver/page_api/Cargo.toml new file mode 100644 index 0000000000..c237949226 --- /dev/null +++ b/pageserver/page_api/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "pageserver_page_api" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +prost.workspace = true +tonic.workspace = true +workspace_hack.workspace = true + +[build-dependencies] +tonic-build.workspace = true diff --git a/pageserver/page_api/build.rs b/pageserver/page_api/build.rs new file mode 100644 index 0000000000..e96297f10e --- /dev/null +++ b/pageserver/page_api/build.rs @@ -0,0 +1,13 @@ +use std::env; +use std::path::PathBuf; + +/// Generates Rust code from .proto Protobuf schemas, along with a binary file +/// descriptor set for Protobuf schema reflection. 
+fn main() -> Result<(), Box> { + let out_dir = PathBuf::from(env::var("OUT_DIR")?); + tonic_build::configure() + .bytes(["."]) + .file_descriptor_set_path(out_dir.join("page_api_descriptor.bin")) + .compile_protos(&["proto/page_service.proto"], &["proto"]) + .map_err(|err| err.into()) +} diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto new file mode 100644 index 0000000000..f6acb3eeeb --- /dev/null +++ b/pageserver/page_api/proto/page_service.proto @@ -0,0 +1,233 @@ +// Page service, presented by pageservers for computes. +// +// This is the compute read path. It primarily serves page versions at given +// LSNs, but also base backups, SLRU segments, and relation metadata. +// +// EXPERIMENTAL: this is still under development and subject to change. +// +// Request metadata headers: +// - authorization: JWT token ("Bearer "), if auth is enabled +// - neon-tenant-id: tenant ID ("7c4a1f9e3bd6470c8f3e21a65bd2e980") +// - neon-shard-id: shard ID, as in hex ("0b10" = shard 11 of 16, 0-based) +// - neon-timeline-id: timeline ID ("f08c4e9a2d5f76b1e3a7c2d8910f4b3e") +// +// The service can be accessed via e.g. grpcurl: +// +// ``` +// grpcurl \ +// -plaintext \ +// -H "neon-tenant-id: 7c4a1f9e3bd6470c8f3e21a65bd2e980" \ +// -H "neon-shard-id: 0b10" \ +// -H "neon-timeline-id: f08c4e9a2d5f76b1e3a7c2d8910f4b3e" \ +// -H "authorization: Bearer $JWT" \ +// -d '{"read_lsn": {"request_lsn": 1234567890}, "rel": {"spc_oid": 1663, "db_oid": 1234, "rel_number": 5678, "fork_number": 0}}' +// localhost:51051 page_api.PageService/CheckRelExists +// ``` +// +// TODO: consider adding neon-compute-mode ("primary", "static", "replica"). +// However, this will require reconnecting when changing modes. +// +// TODO: write implementation guidance on +// - Health checks +// - Tracing, OpenTelemetry +// - Compression + +syntax = "proto3"; +package page_api; + +service PageService { + // Returns whether a relation exists. 
+ rpc CheckRelExists(CheckRelExistsRequest) returns (CheckRelExistsResponse); + + // Fetches a base backup. + rpc GetBaseBackup (GetBaseBackupRequest) returns (stream GetBaseBackupResponseChunk); + + // Returns the total size of a database, as # of bytes. + rpc GetDbSize (GetDbSizeRequest) returns (GetDbSizeResponse); + + // Fetches pages. + // + // This is implemented as a bidirectional streaming RPC for performance. Unary + // requests incur costs for e.g. HTTP/2 stream setup, header parsing, + // authentication, and so on -- with streaming, we only pay these costs during + // the initial stream setup. This ~doubles throughput in benchmarks. Other + // RPCs use regular unary requests, since they are not as frequent and + // performance-critical, and this simplifies implementation. + // + // NB: a status response (e.g. errors) will terminate the stream. The stream + // may be shared by e.g. multiple Postgres backends, so we should avoid this. + // Most errors are therefore sent as GetPageResponse.status instead. + rpc GetPages (stream GetPageRequest) returns (stream GetPageResponse); + + // Returns the size of a relation, as # of blocks. + rpc GetRelSize (GetRelSizeRequest) returns (GetRelSizeResponse); + + // Fetches an SLRU segment. + rpc GetSlruSegment (GetSlruSegmentRequest) returns (GetSlruSegmentResponse); +} + +// The LSN a request should read at. +message ReadLsn { + // The request's read LSN. Required. + uint64 request_lsn = 1; + // If given, the caller guarantees that the page has not been modified since + // this LSN. Must be smaller than or equal to request_lsn. This allows the + // Pageserver to serve an old page without waiting for the request LSN to + // arrive. Valid for all request types. + // + // It is undefined behaviour to make a request such that the page was, in + // fact, modified between request_lsn and not_modified_since_lsn. 
The + // Pageserver might detect it and return an error, or it might return the old + // page version or the new page version. Setting not_modified_since_lsn equal + // to request_lsn is always safe, but can lead to unnecessary waiting. + uint64 not_modified_since_lsn = 2; +} + +// A relation identifier. +message RelTag { + uint32 spc_oid = 1; + uint32 db_oid = 2; + uint32 rel_number = 3; + uint32 fork_number = 4; +} + +// Checks whether a relation exists, at the given LSN. Only valid on shard 0, +// other shards will error. +message CheckRelExistsRequest { + ReadLsn read_lsn = 1; + RelTag rel = 2; +} + +message CheckRelExistsResponse { + bool exists = 1; +} + +// Requests a base backup at a given LSN. +message GetBaseBackupRequest { + // The LSN to fetch a base backup at. + ReadLsn read_lsn = 1; + // If true, logical replication slots will not be created. + bool replica = 2; +} + +// Base backup response chunk, returned as an ordered stream. +message GetBaseBackupResponseChunk { + // A basebackup data chunk. The size is undefined, but bounded by the 4 MB + // gRPC message size limit. + bytes chunk = 1; +} + +// Requests the size of a database, as # of bytes. Only valid on shard 0, other +// shards will error. +message GetDbSizeRequest { + ReadLsn read_lsn = 1; + uint32 db_oid = 2; +} + +message GetDbSizeResponse { + uint64 num_bytes = 1; +} + +// Requests one or more pages. +message GetPageRequest { + // A request ID. Will be included in the response. Should be unique for + // in-flight requests on the stream. + uint64 request_id = 1; + // The request class. + GetPageClass request_class = 2; + // The LSN to read at. + ReadLsn read_lsn = 3; + // The relation to read from. + RelTag rel = 4; + // Page numbers to read. Must belong to the remote shard. + // + // Multiple pages will be executed as a single batch by the Pageserver, + // amortizing layer access costs and parallelizing them. 
This may increase the + // latency of any individual request, but improves the overall latency and + // throughput of the batch as a whole. + // + // TODO: this causes an allocation in the common single-block case. The sender + // can use a SmallVec to stack-allocate it, but Prost will always deserialize + // into a heap-allocated Vec. Consider optimizing this. + // + // TODO: we might be able to avoid a sort or something if we mandate that these + // are always in order. But we can't currently rely on this on the server, because + // of compatibility with the libpq protocol handler. + repeated uint32 block_number = 5; +} + +// A GetPageRequest class. Primarily intended for observability, but may also be +// used for prioritization in the future. +enum GetPageClass { + // Unknown class. For forwards compatibility: used when the client sends a + // class that the server doesn't know about. + GET_PAGE_CLASS_UNKNOWN = 0; + // A normal request. This is the default. + GET_PAGE_CLASS_NORMAL = 1; + // A prefetch request. NB: can only be classified on pg < 18. + GET_PAGE_CLASS_PREFETCH = 2; + // A background request (e.g. vacuum). + GET_PAGE_CLASS_BACKGROUND = 3; +} + +// A GetPage response. +// +// A batch response will contain all of the requested pages. We could eagerly +// emit individual pages as soon as they are ready, but on a readv() Postgres +// holds buffer pool locks on all pages in the batch and we'll only return once +// the entire batch is ready, so no one can make use of the individual pages. +message GetPageResponse { + // The original request's ID. + uint64 request_id = 1; + // The response status code. + GetPageStatus status = 2; + // A string describing the status, if any. + string reason = 3; + // The 8KB page images, in the same order as the request. Empty if status != OK. + repeated bytes page_image = 4; +} + +// A GetPageResponse status code.
Since we use a bidirectional stream, we don't +// want to send errors as gRPC statuses, since this would terminate the stream. +enum GetPageStatus { + // Unknown status. For forwards compatibility: used when the server sends a + // status code that the client doesn't know about. + GET_PAGE_STATUS_UNKNOWN = 0; + // The request was successful. + GET_PAGE_STATUS_OK = 1; + // The page did not exist. The tenant/timeline/shard has already been + // validated during stream setup. + GET_PAGE_STATUS_NOT_FOUND = 2; + // The request was invalid. + GET_PAGE_STATUS_INVALID = 3; + // The tenant is rate limited. Slow down and retry later. + GET_PAGE_STATUS_SLOW_DOWN = 4; + // TODO: consider adding a GET_PAGE_STATUS_LAYER_DOWNLOAD in the case of a + // layer download. This could free up the server task to process other + // requests while the layer download is in progress. +} + +// Fetches the size of a relation at a given LSN, as # of blocks. Only valid on +// shard 0, other shards will error. +message GetRelSizeRequest { + ReadLsn read_lsn = 1; + RelTag rel = 2; +} + +message GetRelSizeResponse { + uint32 num_blocks = 1; +} + +// Requests an SLRU segment. Only valid on shard 0, other shards will error. +message GetSlruSegmentRequest { + ReadLsn read_lsn = 1; + uint32 kind = 2; + uint32 segno = 3; +} + +// Returns an SLRU segment. +// +// These are up to 32 pages (256 KB), so we can send them as a single response. +message GetSlruSegmentResponse { + bytes segment = 1; +} diff --git a/pageserver/page_api/src/lib.rs b/pageserver/page_api/src/lib.rs new file mode 100644 index 0000000000..0b68d03aaa --- /dev/null +++ b/pageserver/page_api/src/lib.rs @@ -0,0 +1,19 @@ +//! This crate provides the Pageserver's page API. It contains: +//! +//! * proto/page_service.proto: the Protobuf schema for the page API. +//! * proto: auto-generated Protobuf types for gRPC. +//! +//! This crate is used by both the client and the server. Try to keep it slim. + +// Code generated by protobuf. 
+pub mod proto { + tonic::include_proto!("page_api"); + + /// File descriptor set for Protobuf schema reflection. This allows using + /// e.g. grpcurl with the API. + pub const FILE_DESCRIPTOR_SET: &[u8] = + tonic::include_file_descriptor_set!("page_api_descriptor"); + + pub use page_service_client::PageServiceClient; + pub use page_service_server::{PageService, PageServiceServer}; +} diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 771a7cbe5b..50419ec338 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -65,6 +65,9 @@ pub(crate) struct Args { #[clap(long, default_value = "1")] queue_depth: NonZeroUsize, + #[clap(long)] + only_relnode: Option, + targets: Option>, } @@ -206,7 +209,12 @@ async fn main_impl( for r in partitioning.keys.ranges.iter() { let mut i = r.start; while i != r.end { - if i.is_rel_block_key() { + let mut include = true; + include &= i.is_rel_block_key(); + if let Some(only_relnode) = args.only_relnode { + include &= i.is_rel_block_of_rel(only_relnode); + } + if include { filtered.add_key(i); } i = i.next(); diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index ded2805602..7e773f56b3 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -230,6 +230,8 @@ pub struct PageServerConf { /// such as authentication requirements for HTTP and PostgreSQL APIs. /// This is insecure and should only be used in development environments. 
pub dev_mode: bool, + + pub timeline_import_config: pageserver_api::config::TimelineImportConfig, } /// Token for authentication to safekeepers @@ -404,6 +406,7 @@ impl PageServerConf { tracing, enable_tls_page_service_api, dev_mode, + timeline_import_config, } = config_toml; let mut conf = PageServerConf { @@ -457,6 +460,7 @@ impl PageServerConf { tracing, enable_tls_page_service_api, dev_mode, + timeline_import_config, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 468e5463b0..dc38ea616c 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -7,7 +7,7 @@ use pageserver_api::models::ShardImportStatus; use pageserver_api::shard::TenantShardId; use pageserver_api::upcall_api::{ PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, - ValidateRequest, ValidateRequestTenant, ValidateResponse, + TimelineImportStatusRequest, ValidateRequest, ValidateRequestTenant, ValidateResponse, }; use reqwest::Certificate; use serde::Serialize; @@ -51,8 +51,15 @@ pub trait StorageControllerUpcallApi { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + generation: Generation, status: ShardImportStatus, ) -> impl Future> + Send; + fn get_timeline_import_status( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + generation: Generation, + ) -> impl Future> + Send; } impl StorageControllerUpcallClient { @@ -97,6 +104,7 @@ impl StorageControllerUpcallClient { &self, url: &url::Url, request: R, + method: reqwest::Method, ) -> Result where R: Serialize, @@ -106,7 +114,7 @@ impl StorageControllerUpcallClient { || async { let response = self .http_client - .post(url.clone()) + .request(method.clone(), url.clone()) .json(&request) .send() .await?; @@ -215,7 +223,9 @@ impl 
StorageControllerUpcallApi for StorageControllerUpcallClient { register: register.clone(), }; - let response: ReAttachResponse = self.retry_http_forever(&url, request).await?; + let response: ReAttachResponse = self + .retry_http_forever(&url, request, reqwest::Method::POST) + .await?; tracing::info!( "Received re-attach response with {} tenants (node {}, register: {:?})", response.tenants.len(), @@ -268,7 +278,9 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { return Err(RetryForeverError::ShuttingDown); } - let response: ValidateResponse = self.retry_http_forever(&url, request).await?; + let response: ValidateResponse = self + .retry_http_forever(&url, request, reqwest::Method::POST) + .await?; for rt in response.tenants { result.insert(rt.id, rt.valid); } @@ -287,6 +299,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { &self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, + generation: Generation, status: ShardImportStatus, ) -> Result<(), RetryForeverError> { let url = self @@ -297,9 +310,35 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient { let request = PutTimelineImportStatusRequest { tenant_shard_id, timeline_id, + generation, status, }; - self.retry_http_forever(&url, request).await + self.retry_http_forever(&url, request, reqwest::Method::POST) + .await + } + + #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context + async fn get_timeline_import_status( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + generation: Generation, + ) -> Result { + let url = self + .base_url + .join("timeline_import_status") + .expect("Failed to build path"); + + let request = TimelineImportStatusRequest { + tenant_shard_id, + timeline_id, + generation, + }; + + let response: ShardImportStatus = self + .retry_http_forever(&url, request, reqwest::Method::GET) + .await?; + Ok(response) } } diff --git a/pageserver/src/deletion_queue.rs 
b/pageserver/src/deletion_queue.rs index 4d62bc4ab5..7854fd9e36 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -663,6 +663,7 @@ mod test { use camino::Utf8Path; use hex_literal::hex; use pageserver_api::key::Key; + use pageserver_api::models::ShardImportStatus; use pageserver_api::shard::ShardIndex; use pageserver_api::upcall_api::ReAttachResponseTenant; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; @@ -792,10 +793,20 @@ mod test { &self, _tenant_shard_id: TenantShardId, _timeline_id: TimelineId, + _generation: Generation, _status: pageserver_api::models::ShardImportStatus, ) -> Result<(), RetryForeverError> { unimplemented!() } + + async fn get_timeline_import_status( + &self, + _tenant_shard_id: TenantShardId, + _timeline_id: TimelineId, + _generation: Generation, + ) -> Result { + unimplemented!() + } } async fn setup(test_name: &str) -> anyhow::Result { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8b6500b020..2edec9dda1 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -3500,6 +3500,107 @@ async fn put_tenant_timeline_import_wal( }.instrument(span).await } +/// Activate a timeline after its import has completed +/// +/// The endpoint is idempotent and callers are expected to retry all +/// errors until a successful response. +async fn activate_post_import_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + const DEFAULT_ACTIVATE_TIMEOUT: Duration = Duration::from_secs(1); + let activate_timeout = parse_query_param(&request, "timeline_activate_timeout_ms")? 
+ .map(Duration::from_millis) + .unwrap_or(DEFAULT_ACTIVATE_TIMEOUT); + + let span = info_span!( + "activate_post_import_handler", + tenant_id=%tenant_shard_id.tenant_id, + timeline_id=%timeline_id, + shard_id=%tenant_shard_id.shard_slug() + ); + + async move { + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + tenant + .finalize_importing_timeline(timeline_id) + .await + .map_err(ApiError::InternalServerError)?; + + match tenant.get_timeline(timeline_id, false) { + Ok(_timeline) => { + // Timeline is already visible. Reset not required: fall through. + } + Err(GetTimelineError::NotFound { .. }) => { + // This is crude: we reset the whole tenant such that the new timeline is detected + // and activated. We can come up with something more granular in the future. + // + // Note that we only reset the tenant if required: when the timeline is + // not present in [`Tenant::timelines`]. + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + state + .tenant_manager + .reset_tenant(tenant_shard_id, false, &ctx) + .await + .map_err(ApiError::InternalServerError)?; + } + Err(GetTimelineError::ShuttingDown) => { + return Err(ApiError::ShuttingDown); + } + Err(GetTimelineError::NotActive { .. 
}) => { + unreachable!("Called get_timeline with active_only=false"); + } + } + + let timeline = tenant.get_timeline(timeline_id, false)?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn) + .with_scope_timeline(&timeline); + + let result = + tokio::time::timeout(activate_timeout, timeline.wait_to_become_active(&ctx)).await; + match result { + Ok(Ok(())) => { + // fallthrough + } + // Timeline reached some other state that's not active + // TODO(vlad): if the tenant is broken, return a permanent error + Ok(Err(_timeline_state)) => { + return Err(ApiError::InternalServerError(anyhow::anyhow!( + "Timeline activation failed" + ))); + } + // Activation timed out + Err(_) => { + return Err(ApiError::Timeout("Timeline activation timed out".into())); + } + } + + let timeline_info = build_timeline_info( + &timeline, false, // include_non_incremental_logical_size, + false, // force_await_initial_logical_size + &ctx, + ) + .await + .context("get local timeline info") + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, timeline_info) + } + .instrument(span) + .await +} + /// Read the end of a tar archive. /// /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. 
@@ -3924,5 +4025,9 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal", |r| api_handler(r, put_tenant_timeline_import_wal), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/activate_post_import", + |r| api_handler(r, activate_post_import_handler), + ) .any(handler_404)) } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index bca1cb5b49..101e312ec3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1278,7 +1278,7 @@ impl PageServerHandler { } #[instrument(level = tracing::Level::DEBUG, skip_all)] - async fn pagesteam_handle_batched_message( + async fn pagestream_handle_batched_message( &mut self, pgb_writer: &mut PostgresBackend, batch: BatchedFeMessage, @@ -1733,7 +1733,7 @@ impl PageServerHandler { }; let result = self - .pagesteam_handle_batched_message( + .pagestream_handle_batched_message( pgb_writer, msg, io_concurrency.clone(), @@ -1909,7 +1909,7 @@ impl PageServerHandler { return Err(e); } }; - self.pagesteam_handle_batched_message( + self.pagestream_handle_batched_message( pgb_writer, batch, io_concurrency.clone(), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e59db74479..441049f47d 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -50,6 +50,7 @@ use remote_timeline_client::{ use secondary::heatmap::{HeatMapTenant, HeatMapTimeline}; use storage_broker::BrokerClientChannel; use timeline::compaction::{CompactionOutcome, GcCompactionQueue}; +use timeline::import_pgdata::ImportingTimeline; use timeline::offload::{OffloadError, offload_timeline}; use timeline::{ CompactFlags, CompactOptions, CompactionError, PreviousHeatmap, ShutdownMode, import_pgdata, @@ -284,6 +285,19 @@ pub struct TenantShard { /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` timelines_offloaded: Mutex>>, + /// Tracks the timelines that are currently importing 
into this tenant shard. + /// + /// Note that importing timelines are also present in [`Self::timelines_creating`]. + /// Keep this in mind when ordering lock acquisition. + /// + /// Lifetime: + /// * An imported timeline is created while scanning the bucket on tenant attach + /// if the index part contains an `import_pgdata` entry and said field marks the import + /// as in progress. + /// * Imported timelines are removed when the storage controller calls the post timeline + /// import activation endpoint. + timelines_importing: std::sync::Mutex>, + /// The last tenant manifest known to be in remote storage. None if the manifest has not yet /// been either downloaded or uploaded. Always Some after tenant attach. /// @@ -923,19 +937,10 @@ enum StartCreatingTimelineResult { #[allow(clippy::large_enum_variant, reason = "TODO")] enum TimelineInitAndSyncResult { - ReadyToActivate(Arc), + ReadyToActivate, NeedsSpawnImportPgdata(TimelineInitAndSyncNeedsSpawnImportPgdata), } -impl TimelineInitAndSyncResult { - fn ready_to_activate(self) -> Option> { - match self { - Self::ReadyToActivate(timeline) => Some(timeline), - _ => None, - } - } -} - #[must_use] struct TimelineInitAndSyncNeedsSpawnImportPgdata { timeline: Arc, @@ -1012,10 +1017,6 @@ enum CreateTimelineCause { enum LoadTimelineCause { Attach, Unoffload, - ImportPgdata { - create_guard: TimelineCreateGuard, - activate: ActivateTimelineArgs, - }, } #[derive(thiserror::Error, Debug)] @@ -1097,7 +1098,7 @@ impl TenantShard { self: &Arc, timeline_id: TimelineId, resources: TimelineResources, - mut index_part: IndexPart, + index_part: IndexPart, metadata: TimelineMetadata, previous_heatmap: Option, ancestor: Option>, @@ -1106,7 +1107,7 @@ impl TenantShard { ) -> anyhow::Result { let tenant_id = self.tenant_shard_id; - let import_pgdata = index_part.import_pgdata.take(); + let import_pgdata = index_part.import_pgdata.clone(); let idempotency = match &import_pgdata { Some(import_pgdata) => { 
CreateTimelineIdempotency::ImportPgdata(CreatingTimelineIdempotencyImportPgdata { @@ -1127,7 +1128,7 @@ impl TenantShard { } }; - let (timeline, timeline_ctx) = self.create_timeline_struct( + let (timeline, _timeline_ctx) = self.create_timeline_struct( timeline_id, &metadata, previous_heatmap, @@ -1197,14 +1198,6 @@ impl TenantShard { match import_pgdata { Some(import_pgdata) if !import_pgdata.is_done() => { - match cause { - LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), - LoadTimelineCause::ImportPgdata { .. } => { - unreachable!( - "ImportPgdata should not be reloading timeline import is done and persisted as such in s3" - ) - } - } let mut guard = self.timelines_creating.lock().unwrap(); if !guard.insert(timeline_id) { // We should never try and load the same timeline twice during startup @@ -1260,26 +1253,7 @@ impl TenantShard { "Timeline has no ancestor and no layer files" ); - match cause { - LoadTimelineCause::Attach | LoadTimelineCause::Unoffload => (), - LoadTimelineCause::ImportPgdata { - create_guard, - activate, - } => { - // TODO: see the comment in the task code above how I'm not so certain - // it is safe to activate here because of concurrent shutdowns. 
- match activate { - ActivateTimelineArgs::Yes { broker_client } => { - info!("activating timeline after reload from pgdata import task"); - timeline.activate(self.clone(), broker_client, None, &timeline_ctx); - } - ActivateTimelineArgs::No => (), - } - drop(create_guard); - } - } - - Ok(TimelineInitAndSyncResult::ReadyToActivate(timeline)) + Ok(TimelineInitAndSyncResult::ReadyToActivate) } } } @@ -1768,7 +1742,7 @@ impl TenantShard { })?; match effect { - TimelineInitAndSyncResult::ReadyToActivate(_) => { + TimelineInitAndSyncResult::ReadyToActivate => { // activation happens later, on Tenant::activate } TimelineInitAndSyncResult::NeedsSpawnImportPgdata( @@ -1778,13 +1752,24 @@ impl TenantShard { guard, }, ) => { - tokio::task::spawn(self.clone().create_timeline_import_pgdata_task( - timeline, - import_pgdata, - ActivateTimelineArgs::No, - guard, - ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), - )); + let timeline_id = timeline.timeline_id; + let import_task_handle = + tokio::task::spawn(self.clone().create_timeline_import_pgdata_task( + timeline.clone(), + import_pgdata, + guard, + ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), + )); + + let prev = self.timelines_importing.lock().unwrap().insert( + timeline_id, + ImportingTimeline { + timeline: timeline.clone(), + import_task_handle, + }, + ); + + assert!(prev.is_none()); } } } @@ -2678,14 +2663,7 @@ impl TenantShard { .await? } CreateTimelineParams::ImportPgdata(params) => { - self.create_timeline_import_pgdata( - params, - ActivateTimelineArgs::Yes { - broker_client: broker_client.clone(), - }, - ctx, - ) - .await? + self.create_timeline_import_pgdata(params, ctx).await? 
} }; @@ -2759,7 +2737,6 @@ impl TenantShard { async fn create_timeline_import_pgdata( self: &Arc, params: CreateTimelineParamsImportPgdata, - activate: ActivateTimelineArgs, ctx: &RequestContext, ) -> Result { let CreateTimelineParamsImportPgdata { @@ -2840,24 +2817,71 @@ impl TenantShard { let (timeline, timeline_create_guard) = uninit_timeline.finish_creation_myself(); - tokio::spawn(self.clone().create_timeline_import_pgdata_task( + let import_task_handle = tokio::spawn(self.clone().create_timeline_import_pgdata_task( timeline.clone(), index_part, - activate, timeline_create_guard, timeline_ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Warn), )); + let prev = self.timelines_importing.lock().unwrap().insert( + timeline.timeline_id, + ImportingTimeline { + timeline: timeline.clone(), + import_task_handle, + }, + ); + + // Idempotency is enforced higher up the stack + assert!(prev.is_none()); + // NB: the timeline doesn't exist in self.timelines at this point Ok(CreateTimelineResult::ImportSpawned(timeline)) } + /// Finalize the import of a timeline on this shard by marking it complete in + /// the index part. If the import task hasn't finished yet, returns an error. + /// + /// This method is idempotent. If the import was finalized once, the next call + /// will be a no-op. 
+ pub(crate) async fn finalize_importing_timeline( + &self, + timeline_id: TimelineId, + ) -> anyhow::Result<()> { + let timeline = { + let locked = self.timelines_importing.lock().unwrap(); + match locked.get(&timeline_id) { + Some(importing_timeline) => { + if !importing_timeline.import_task_handle.is_finished() { + return Err(anyhow::anyhow!("Import task not done yet")); + } + + importing_timeline.timeline.clone() + } + None => { + return Ok(()); + } + } + }; + + timeline + .remote_client + .schedule_index_upload_for_import_pgdata_finalize()?; + timeline.remote_client.wait_completion().await?; + + self.timelines_importing + .lock() + .unwrap() + .remove(&timeline_id); + + Ok(()) + } + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))] async fn create_timeline_import_pgdata_task( self: Arc, timeline: Arc, index_part: import_pgdata::index_part_format::Root, - activate: ActivateTimelineArgs, timeline_create_guard: TimelineCreateGuard, ctx: RequestContext, ) { @@ -2869,7 +2893,6 @@ impl TenantShard { .create_timeline_import_pgdata_task_impl( timeline, index_part, - activate, timeline_create_guard, ctx, ) @@ -2885,60 +2908,15 @@ impl TenantShard { self: Arc, timeline: Arc, index_part: import_pgdata::index_part_format::Root, - activate: ActivateTimelineArgs, - timeline_create_guard: TimelineCreateGuard, + _timeline_create_guard: TimelineCreateGuard, ctx: RequestContext, ) -> Result<(), anyhow::Error> { info!("importing pgdata"); + let ctx = ctx.with_scope_timeline(&timeline); import_pgdata::doit(&timeline, index_part, &ctx, self.cancel.clone()) .await .context("import")?; - info!("import done"); - - // - // Reload timeline from remote. - // This proves that the remote state is attachable, and it reuses the code. - // - // TODO: think about whether this is safe to do with concurrent TenantShard::shutdown. 
- // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit. - // But our activate() call might launch new background tasks after TenantShard::shutdown - // already went past shutting down the TenantShard::timelines, which this timeline here is no part of. - // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting - // down while bootstrapping/branching + activating), but, the race condition is much more likely - // to manifest because of the long runtime of this import task. - - // in theory this shouldn't even .await anything except for coop yield - info!("shutting down timeline"); - timeline.shutdown(ShutdownMode::Hard).await; - info!("timeline shut down, reloading from remote"); - // TODO: we can't do the following check because create_timeline_import_pgdata must return an Arc - // let Some(timeline) = Arc::into_inner(timeline) else { - // anyhow::bail!("implementation error: timeline that we shut down was still referenced from somewhere"); - // }; - let timeline_id = timeline.timeline_id; - - // load from object storage like TenantShard::attach does - let resources = self.build_timeline_resources(timeline_id); - let index_part = resources - .remote_client - .download_index_file(&self.cancel) - .await?; - let index_part = match index_part { - MaybeDeletedIndexPart::Deleted(_) => { - // likely concurrent delete call, cplane should prevent this - anyhow::bail!( - "index part says deleted but we are not done creating yet, this should not happen but" - ) - } - MaybeDeletedIndexPart::IndexPart(p) => p, - }; - let metadata = index_part.metadata.clone(); - self - .load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{ - create_guard: timeline_create_guard, activate, }, &ctx) - .await? 
- .ready_to_activate() - .context("implementation error: reloaded timeline still needs import after import reported success")?; + info!("import done - waiting for activation"); anyhow::Ok(()) } @@ -3475,6 +3453,14 @@ impl TenantShard { timeline.defuse_for_tenant_drop(); }); } + { + let mut timelines_importing = self.timelines_importing.lock().unwrap(); + timelines_importing + .drain() + .for_each(|(_timeline_id, importing_timeline)| { + importing_timeline.shutdown(); + }); + } // test_long_timeline_create_then_tenant_delete is leaning on this message tracing::info!("Waiting for timelines..."); while let Some(res) = js.join_next().await { @@ -3949,13 +3935,6 @@ where Ok(result) } -enum ActivateTimelineArgs { - Yes { - broker_client: storage_broker::BrokerClientChannel, - }, - No, -} - impl TenantShard { pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig { self.tenant_conf.load().tenant_conf.clone() @@ -4322,6 +4301,7 @@ impl TenantShard { timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()), timelines_offloaded: Mutex::new(HashMap::new()), + timelines_importing: Mutex::new(HashMap::new()), remote_tenant_manifest: Default::default(), gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 8cf3c548c9..ed541c4f12 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -94,10 +94,23 @@ impl Header { pub enum WriteBlobError { #[error(transparent)] Flush(FlushTaskError), - #[error("blob too large ({len} bytes)")] - BlobTooLarge { len: usize }, #[error(transparent)] - WriteBlobRaw(anyhow::Error), + Other(anyhow::Error), +} + +impl WriteBlobError { + pub fn is_cancel(&self) -> bool { + match self { + WriteBlobError::Flush(e) => e.is_cancel(), + WriteBlobError::Other(_) => false, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + WriteBlobError::Flush(e) => e.into_anyhow(), + 
WriteBlobError::Other(e) => e, + } + } } impl BlockCursor<'_> { @@ -327,7 +340,9 @@ where return ( ( io_buf.slice_len(), - Err(WriteBlobError::BlobTooLarge { len }), + Err(WriteBlobError::Other(anyhow::anyhow!( + "blob too large ({len} bytes)" + ))), ), srcbuf, ); @@ -391,7 +406,7 @@ where // Verify the header, to ensure we don't write invalid/corrupt data. let header = match Header::decode(&raw_with_header) .context("decoding blob header") - .map_err(WriteBlobError::WriteBlobRaw) + .map_err(WriteBlobError::Other) { Ok(header) => header, Err(err) => return (raw_with_header, Err(err)), @@ -401,7 +416,7 @@ where let raw_len = raw_with_header.len(); return ( raw_with_header, - Err(WriteBlobError::WriteBlobRaw(anyhow::anyhow!( + Err(WriteBlobError::Other(anyhow::anyhow!( "header length mismatch: {header_total_len} != {raw_len}" ))), ); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index ea29f51956..21d68495f7 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -949,6 +949,35 @@ impl RemoteTimelineClient { Ok(()) } + /// If the `import_pgdata` field marks the timeline as having an import in progress, + /// launch an index-file upload operation that transitions it to done in the background + pub(crate) fn schedule_index_upload_for_import_pgdata_finalize( + self: &Arc, + ) -> anyhow::Result<()> { + use import_pgdata::index_part_format; + + let mut guard = self.upload_queue.lock().unwrap(); + let upload_queue = guard.initialized_mut()?; + let to_update = match &upload_queue.dirty.import_pgdata { + Some(import) if !import.is_done() => Some(import), + Some(_) | None => None, + }; + + if let Some(old) = to_update { + let new = + index_part_format::Root::V1(index_part_format::V1::Done(index_part_format::Done { + idempotency_key: old.idempotency_key().clone(), + started_at: *old.started_at(), + finished_at: chrono::Utc::now().naive_utc(), + })); + + 
upload_queue.dirty.import_pgdata = Some(new); + self.schedule_index_upload(upload_queue); + } + + Ok(()) + } + /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field. pub(crate) fn schedule_index_upload_for_gc_compaction_state_update( self: &Arc, diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index c26b7626ef..dd49c843f3 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -668,7 +668,9 @@ impl From for UpdateError { impl From for UpdateError { fn from(value: std::io::Error) -> Self { - if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) { + if let Some(nix::errno::Errno::ENOSPC) = + value.raw_os_error().map(nix::errno::Errno::from_raw) + { UpdateError::NoSpace } else if value .get_ref() diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 796ad01e54..5dfa961b71 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -2,6 +2,7 @@ pub mod batch_split_writer; pub mod delta_layer; +pub mod errors; pub mod filter_iterator; pub mod image_layer; pub mod inmemory_layer; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 39cd02d101..51f2e909a2 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -10,6 +10,7 @@ use utils::id::TimelineId; use utils::lsn::Lsn; use utils::shard::TenantShardId; +use super::errors::PutError; use super::layer::S3_UPLOAD_LIMIT; use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, @@ -235,7 +236,7 @@ impl<'a> SplitImageLayerWriter<'a> { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { // The 
current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. @@ -253,7 +254,8 @@ impl<'a> SplitImageLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(PutError::Other)?; let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); self.batches.add_unfinished_image_writer( prev_image_writer, @@ -346,7 +348,7 @@ impl<'a> SplitDeltaLayerWriter<'a> { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate // number, and therefore the final layer size could be a little bit larger or smaller than the target. // @@ -366,7 +368,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?, + .await + .map_err(PutError::Other)?, )); } let (_, inner) = self.inner.as_mut().unwrap(); @@ -386,7 +389,8 @@ impl<'a> SplitDeltaLayerWriter<'a> { self.cancel.clone(), ctx, ) - .await?; + .await + .map_err(PutError::Other)?; let (start_key, prev_delta_writer) = self.inner.replace((key, next_delta_writer)).unwrap(); self.batches.add_unfinished_delta_writer( @@ -396,11 +400,11 @@ impl<'a> SplitDeltaLayerWriter<'a> { ); } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. 
- anyhow::bail!( + return Err(PutError::Other(anyhow::anyhow!( "a single key is updated too often: key={}, estimated_size={}, and the layer file cannot be produced", key, inner.estimated_size() - ); + ))); } } self.last_key_written = key; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 11875ac653..2c1b27c8d5 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -55,6 +55,7 @@ use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use super::errors::PutError; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, ValuesReconstructState, @@ -477,12 +478,15 @@ impl DeltaLayerWriterInner { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { let (_, res) = self .put_value_bytes( key, lsn, - Value::ser(&val)?.slice_len(), + Value::ser(&val) + .map_err(anyhow::Error::new) + .map_err(PutError::Other)? 
+ .slice_len(), val.will_init(), ctx, ) @@ -497,7 +501,7 @@ impl DeltaLayerWriterInner { val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (FullSlice, anyhow::Result<()>) + ) -> (FullSlice, Result<(), PutError>) where Buf: IoBuf + Send, { @@ -513,19 +517,24 @@ impl DeltaLayerWriterInner { .blob_writer .write_blob_maybe_compressed(val, ctx, compression) .await; + let res = res.map_err(PutError::WriteBlob); let off = match res { Ok((off, _)) => off, - Err(e) => return (val, Err(anyhow::anyhow!(e))), + Err(e) => return (val, Err(e)), }; let blob_ref = BlobRef::new(off, will_init); let delta_key = DeltaKey::from_key_lsn(&key, lsn); - let res = self.tree.append(&delta_key.0, blob_ref.0); + let res = self + .tree + .append(&delta_key.0, blob_ref.0) + .map_err(anyhow::Error::new) + .map_err(PutError::Other); self.num_keys += 1; - (val, res.map_err(|e| anyhow::anyhow!(e))) + (val, res) } fn size(&self) -> u64 { @@ -694,7 +703,7 @@ impl DeltaLayerWriter { lsn: Lsn, val: Value, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { self.inner .as_mut() .unwrap() @@ -709,7 +718,7 @@ impl DeltaLayerWriter { val: FullSlice, will_init: bool, ctx: &RequestContext, - ) -> (FullSlice, anyhow::Result<()>) + ) -> (FullSlice, Result<(), PutError>) where Buf: IoBuf + Send, { diff --git a/pageserver/src/tenant/storage_layer/errors.rs b/pageserver/src/tenant/storage_layer/errors.rs new file mode 100644 index 0000000000..591e489faa --- /dev/null +++ b/pageserver/src/tenant/storage_layer/errors.rs @@ -0,0 +1,24 @@ +use crate::tenant::blob_io::WriteBlobError; + +#[derive(Debug, thiserror::Error)] +pub enum PutError { + #[error(transparent)] + WriteBlob(WriteBlobError), + #[error(transparent)] + Other(anyhow::Error), +} + +impl PutError { + pub fn is_cancel(&self) -> bool { + match self { + PutError::WriteBlob(e) => e.is_cancel(), + PutError::Other(_) => false, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self { + PutError::WriteBlob(e) 
=> e.into_anyhow(), + PutError::Other(e) => e, + } + } +} diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index d684230572..740f53f928 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -53,6 +53,7 @@ use utils::bin_ser::SerializeError; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use super::errors::PutError; use super::layer_name::ImageLayerName; use super::{ AsLayerDesc, LayerName, OnDiskValue, OnDiskValueIo, PersistentLayerDesc, ResidentLayer, @@ -842,8 +843,14 @@ impl ImageLayerWriterInner { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { - ensure!(self.key_range.contains(&key)); + ) -> Result<(), PutError> { + if !self.key_range.contains(&key) { + return Err(PutError::Other(anyhow::anyhow!( + "key {:?} not in range {:?}", + key, + self.key_range + ))); + } let compression = self.conf.image_compression; let uncompressed_len = img.len() as u64; self.uncompressed_bytes += uncompressed_len; @@ -853,7 +860,7 @@ impl ImageLayerWriterInner { .write_blob_maybe_compressed(img.slice_len(), ctx, compression) .await; // TODO: re-use the buffer for `img` further upstack - let (off, compression_info) = res?; + let (off, compression_info) = res.map_err(PutError::WriteBlob)?; if compression_info.compressed_size.is_some() { // The image has been considered for compression at least self.uncompressed_bytes_eligible += uncompressed_len; @@ -865,7 +872,10 @@ impl ImageLayerWriterInner { let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); - self.tree.append(&keybuf, off)?; + self.tree + .append(&keybuf, off) + .map_err(anyhow::Error::new) + .map_err(PutError::Other)?; #[cfg(feature = "testing")] { @@ -1085,7 +1095,7 @@ impl ImageLayerWriter { key: Key, img: Bytes, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> Result<(), PutError> { 
self.inner.as_mut().unwrap().put_image(key, img, ctx).await } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 1112a5330b..4709a6d616 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -340,7 +340,7 @@ pub(crate) fn log_compaction_error( } else { match level { Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"), - Level::ERROR => error!("Compaction failed: {err:#}"), + Level::ERROR => error!("Compaction failed: {err:?}"), Level::INFO => info!("Compaction failed: {err:#}"), level => unimplemented!("unexpected level {level:?}"), } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f3d01b7605..c1d7b91cf4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -987,6 +987,16 @@ impl From for CreateImageLayersError { } } +impl From for CreateImageLayersError { + fn from(e: super::storage_layer::errors::PutError) -> Self { + if e.is_cancel() { + CreateImageLayersError::Cancelled + } else { + CreateImageLayersError::Other(e.into_anyhow()) + } + } +} + impl From for CreateImageLayersError { fn from(e: GetVectoredError) -> Self { match e { @@ -2117,22 +2127,14 @@ impl Timeline { debug_assert_current_span_has_tenant_and_timeline_id(); // Regardless of whether we're going to try_freeze_and_flush - // or not, stop ingesting any more data. Walreceiver only provides - // cancellation but no "wait until gone", because it uses the Timeline::gate. - // So, only after the self.gate.close() below will we know for sure that - // no walreceiver tasks are left. - // For `try_freeze_and_flush=true`, this means that we might still be ingesting - // data during the call to `self.freeze_and_flush()` below. - // That's not ideal, but, we don't have the concept of a ChildGuard, - // which is what we'd need to properly model early shutdown of the walreceiver - // task sub-tree before the other Timeline task sub-trees. 
+ // or not, stop ingesting any more data. let walreceiver = self.walreceiver.lock().unwrap().take(); tracing::debug!( is_some = walreceiver.is_some(), "Waiting for WalReceiverManager..." ); if let Some(walreceiver) = walreceiver { - walreceiver.cancel(); + walreceiver.shutdown().await; } // ... and inform any waiters for newer LSNs that there won't be any. self.last_record_lsn.shutdown(); @@ -5930,6 +5932,16 @@ impl From for CompactionError { } } +impl From for CompactionError { + fn from(e: super::storage_layer::errors::PutError) -> Self { + if e.is_cancel() { + CompactionError::ShuttingDown + } else { + CompactionError::Other(e.into_anyhow()) + } + } +} + #[serde_as] #[derive(serde::Serialize)] struct RecordedDuration(#[serde_as(as = "serde_with::DurationMicroSeconds")] Duration); diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d0c13d86ce..37c1a8f60c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1277,6 +1277,8 @@ impl Timeline { return Ok(CompactionOutcome::YieldForL0); } + let gc_cutoff = *self.applied_gc_cutoff_lsn.read(); + // 2. Repartition and create image layers if necessary match self .repartition( @@ -1287,7 +1289,7 @@ impl Timeline { ) .await { - Ok(((dense_partitioning, sparse_partitioning), lsn)) => { + Ok(((dense_partitioning, sparse_partitioning), lsn)) if lsn >= gc_cutoff => { // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them let image_ctx = RequestContextBuilder::from(ctx) .access_stats_behavior(AccessStatsBehavior::Skip) @@ -1341,6 +1343,10 @@ impl Timeline { } } + Ok(_) => { + info!("skipping repartitioning due to image compaction LSN being below GC cutoff"); + } + // Suppress errors when cancelled. 
Err(_) if self.cancel.is_cancelled() => {} Err(err) if err.is_cancel() => {} @@ -2204,8 +2210,7 @@ impl Timeline { .as_mut() .unwrap() .put_value(key, lsn, value, ctx) - .await - .map_err(CompactionError::Other)?; + .await?; } else { let owner = self.shard_identity.get_shard_number(&key); @@ -3430,6 +3435,7 @@ impl Timeline { // Step 2: Produce images+deltas. let mut accumulated_values = Vec::new(); + let mut accumulated_values_estimated_size = 0; let mut last_key: Option = None; // Only create image layers when there is no ancestor branches. TODO: create covering image layer @@ -3606,7 +3612,18 @@ impl Timeline { if last_key.is_none() { last_key = Some(key); } + accumulated_values_estimated_size += val.estimated_size(); accumulated_values.push((key, lsn, val)); + + // Accumulated values should never exceed 512MB. + if accumulated_values_estimated_size >= 1024 * 1024 * 512 { + return Err(CompactionError::Other(anyhow!( + "too many values for a single key: {} for key {}, {} items", + accumulated_values_estimated_size, + key, + accumulated_values.len() + ))); + } } else { let last_key: &mut Key = last_key.as_mut().unwrap(); stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction @@ -3639,6 +3656,7 @@ impl Timeline { .map_err(CompactionError::Other)?; accumulated_values.clear(); *last_key = key; + accumulated_values_estimated_size = val.estimated_size(); accumulated_values.push((key, lsn, val)); } } diff --git a/pageserver/src/tenant/timeline/import_pgdata.rs b/pageserver/src/tenant/timeline/import_pgdata.rs index 6ab6b90cb6..658d867c18 100644 --- a/pageserver/src/tenant/timeline/import_pgdata.rs +++ b/pageserver/src/tenant/timeline/import_pgdata.rs @@ -1,8 +1,10 @@ use std::sync::Arc; use anyhow::{Context, bail}; +use importbucket_client::{ControlFile, RemoteStorageWrapper}; use pageserver_api::models::ShardImportStatus; use remote_storage::RemotePath; +use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::info; 
use utils::lsn::Lsn; @@ -17,6 +19,17 @@ mod importbucket_client; mod importbucket_format; pub(crate) mod index_part_format; +pub(crate) struct ImportingTimeline { + pub import_task_handle: JoinHandle<()>, + pub timeline: Arc, +} + +impl ImportingTimeline { + pub(crate) fn shutdown(self) { + self.import_task_handle.abort(); + } +} + pub async fn doit( timeline: &Arc, index_part: index_part_format::Root, @@ -26,180 +39,225 @@ pub async fn doit( let index_part_format::Root::V1(v1) = index_part; let index_part_format::InProgress { location, - idempotency_key, - started_at, + idempotency_key: _, + started_at: _, } = match v1 { index_part_format::V1::Done(_) => return Ok(()), index_part_format::V1::InProgress(in_progress) => in_progress, }; - let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; + let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); - let status_prefix = RemotePath::from_string("status").unwrap(); - - // - // See if shard is done. - // TODO: incorporate generations into status key for split brain safety. Figure out together with checkpointing. - // - let shard_status_key = - status_prefix.join(format!("shard-{}", timeline.tenant_shard_id.shard_slug())); - let shard_status: Option = - storage.get_json(&shard_status_key).await?; - info!(?shard_status, "peeking shard status"); - if shard_status.map(|st| st.done).unwrap_or(false) { - info!("shard status indicates that the shard is done, skipping import"); - } else { - // TODO: checkpoint the progress into the IndexPart instead of restarting - // from the beginning. - - // - // Wipe the slate clean - the flow does not allow resuming. - // We can implement resuming in the future by checkpointing the progress into the IndexPart. - // - info!("wipe the slate clean"); - { - // TODO: do we need to hold GC lock for this? 
- let mut guard = timeline.layers.write().await; - assert!( - guard.layer_map()?.open_layer.is_none(), - "while importing, there should be no in-memory layer" // this just seems like a good place to assert it - ); - let all_layers_keys = guard.all_persistent_layers(); - let all_layers: Vec<_> = all_layers_keys - .iter() - .map(|key| guard.get_from_key(key)) - .collect(); - let open = guard.open_mut().context("open_mut")?; - - timeline.remote_client.schedule_gc_update(&all_layers)?; - open.finish_gc_timeline(&all_layers); - } - - // - // Wait for pgdata to finish uploading - // - info!("wait for pgdata to reach status 'done'"); - let pgdata_status_key = status_prefix.join("pgdata"); - loop { - let res = async { - let pgdata_status: Option = storage - .get_json(&pgdata_status_key) - .await - .context("get pgdata status")?; - info!(?pgdata_status, "peeking pgdata status"); - if pgdata_status.map(|st| st.done).unwrap_or(false) { - Ok(()) - } else { - Err(anyhow::anyhow!("pgdata not done yet")) - } - } - .await; - match res { - Ok(_) => break, - Err(err) => { - info!(?err, "indefinitely waiting for pgdata to finish"); - if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) - .await - .is_ok() - { - bail!("cancelled while waiting for pgdata"); - } - } - } - } - - // - // Do the import - // - info!("do the import"); - let control_file = storage.get_control_file().await?; - let base_lsn = control_file.base_lsn(); - - info!("update TimelineMetadata based on LSNs from control file"); - { - let pg_version = control_file.pg_version(); - let _ctx: &RequestContext = ctx; - async move { - // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the - // checkpoint record, and prev_record_lsn should point to its beginning. - // We should read the real end of the record from the WAL, but here we - // just fake it. 
- let disk_consistent_lsn = Lsn(base_lsn.0 + 8); - let prev_record_lsn = base_lsn; - let metadata = TimelineMetadata::new( - disk_consistent_lsn, - Some(prev_record_lsn), - None, // no ancestor - Lsn(0), // no ancestor lsn - base_lsn, // latest_gc_cutoff_lsn - base_lsn, // initdb_lsn - pg_version, - ); - - let _start_lsn = disk_consistent_lsn + 1; - - timeline - .remote_client - .schedule_index_upload_for_full_metadata_update(&metadata)?; - - timeline.remote_client.wait_completion().await?; - - anyhow::Ok(()) - } - } - .await?; - - flow::run( - timeline.clone(), - base_lsn, - control_file, - storage.clone(), - ctx, + let shard_status = storcon_client + .get_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, ) - .await?; + .await + .map_err(|_err| anyhow::anyhow!("Shut down while getting timeline import status"))?; - // - // Communicate that shard is done. - // Ensure at-least-once delivery of the upcall to storage controller - // before we mark the task as done and never come here again. - // - let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel); - storcon_client - .put_timeline_import_status( - timeline.tenant_shard_id, - timeline.timeline_id, - // TODO(vlad): What about import errors? - ShardImportStatus::Done, - ) - .await - .map_err(|_err| anyhow::anyhow!("Shut down while putting timeline import status"))?; + info!(?shard_status, "peeking shard status"); + match shard_status { + ShardImportStatus::InProgress(maybe_progress) => { + let storage = + importbucket_client::new(timeline.conf, &location, cancel.clone()).await?; - storage - .put_json( - &shard_status_key, - &importbucket_format::ShardStatus { done: true }, + let control_file_res = if maybe_progress.is_none() { + // Only prepare the import once when there's no progress. 
+ prepare_import(timeline, storage.clone(), &cancel).await + } else { + storage.get_control_file().await + }; + + let control_file = match control_file_res { + Ok(cf) => cf, + Err(err) => { + return Err( + terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await, + ); + } + }; + + let res = flow::run( + timeline.clone(), + control_file, + storage.clone(), + maybe_progress, + ctx, ) - .await - .context("put shard status")?; + .await; + if let Err(err) = res { + return Err( + terminate_flow_with_error(timeline, err, &storcon_client, &cancel).await, + ); + } + + // Communicate that shard is done. + // Ensure at-least-once delivery of the upcall to storage controller + // before we mark the task as done and never come here again. + // + // Note that we do not mark the import complete in the index part now. + // This happens in [`Tenant::finalize_importing_timeline`] in response + // to the storage controller calling + // `/v1/tenant/:tenant_id/timeline/:timeline_id/activate_post_import`. + storcon_client + .put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, + ShardImportStatus::Done, + ) + .await + .map_err(|_err| { + anyhow::anyhow!("Shut down while putting timeline import status") + })?; + } + ShardImportStatus::Error(err) => { + info!( + "shard status indicates that the shard is done (error), skipping import {}", + err + ); + } + ShardImportStatus::Done => { + info!("shard status indicates that the shard is done (success), skipping import"); + } } - // - // Mark as done in index_part. - // This makes subsequent timeline loads enter the normal load code path - // instead of spawning the import task and calling this here function. 
- // - info!("mark import as complete in index part"); - timeline - .remote_client - .schedule_index_upload_for_import_pgdata_state_update(Some(index_part_format::Root::V1( - index_part_format::V1::Done(index_part_format::Done { - idempotency_key, - started_at, - finished_at: chrono::Utc::now().naive_utc(), - }), - )))?; - - timeline.remote_client.wait_completion().await?; - Ok(()) } + +async fn prepare_import( + timeline: &Arc, + storage: RemoteStorageWrapper, + cancel: &CancellationToken, +) -> anyhow::Result { + // Wipe the slate clean before starting the import as a precaution. + // This method is only called when there's no recorded checkpoint for the import + // in the storage controller. + // + // Note that this is split-brain safe (two imports for same timeline shards running in + // different generations) because we go through the usual deletion path, including deletion queue. + info!("wipe the slate clean"); + { + // TODO: do we need to hold GC lock for this? + let mut guard = timeline.layers.write().await; + assert!( + guard.layer_map()?.open_layer.is_none(), + "while importing, there should be no in-memory layer" // this just seems like a good place to assert it + ); + let all_layers_keys = guard.all_persistent_layers(); + let all_layers: Vec<_> = all_layers_keys + .iter() + .map(|key| guard.get_from_key(key)) + .collect(); + let open = guard.open_mut().context("open_mut")?; + + timeline.remote_client.schedule_gc_update(&all_layers)?; + open.finish_gc_timeline(&all_layers); + } + + // + // Wait for pgdata to finish uploading + // + info!("wait for pgdata to reach status 'done'"); + let status_prefix = RemotePath::from_string("status").unwrap(); + let pgdata_status_key = status_prefix.join("pgdata"); + loop { + let res = async { + let pgdata_status: Option = storage + .get_json(&pgdata_status_key) + .await + .context("get pgdata status")?; + info!(?pgdata_status, "peeking pgdata status"); + if pgdata_status.map(|st| st.done).unwrap_or(false) { + Ok(()) + 
} else { + Err(anyhow::anyhow!("pgdata not done yet")) + } + } + .await; + match res { + Ok(_) => break, + Err(err) => { + info!(?err, "indefinitely waiting for pgdata to finish"); + if tokio::time::timeout(std::time::Duration::from_secs(10), cancel.cancelled()) + .await + .is_ok() + { + bail!("cancelled while waiting for pgdata"); + } + } + } + } + + let control_file = storage.get_control_file().await?; + let base_lsn = control_file.base_lsn(); + + info!("update TimelineMetadata based on LSNs from control file"); + { + let pg_version = control_file.pg_version(); + async move { + // FIXME: The 'disk_consistent_lsn' should be the LSN at the *end* of the + // checkpoint record, and prev_record_lsn should point to its beginning. + // We should read the real end of the record from the WAL, but here we + // just fake it. + let disk_consistent_lsn = Lsn(base_lsn.0 + 8); + let prev_record_lsn = base_lsn; + let metadata = TimelineMetadata::new( + disk_consistent_lsn, + Some(prev_record_lsn), + None, // no ancestor + Lsn(0), // no ancestor lsn + base_lsn, // latest_gc_cutoff_lsn + base_lsn, // initdb_lsn + pg_version, + ); + + let _start_lsn = disk_consistent_lsn + 1; + + timeline + .remote_client + .schedule_index_upload_for_full_metadata_update(&metadata)?; + + timeline.remote_client.wait_completion().await?; + + anyhow::Ok(()) + } + } + .await?; + + Ok(control_file) +} + +async fn terminate_flow_with_error( + timeline: &Arc, + error: anyhow::Error, + storcon_client: &StorageControllerUpcallClient, + cancel: &CancellationToken, +) -> anyhow::Error { + // The import task is aborted on tenant shutdown, so in principle, it should + // never be cancelled. To be on the safe side, check the cancellation tokens + // before marking the import as failed. 
+ if !(cancel.is_cancelled() || timeline.cancel.is_cancelled()) { + let notify_res = storcon_client + .put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, + ShardImportStatus::Error(format!("{error:#}")), + ) + .await; + + if let Err(_notify_error) = notify_res { + // The [`StorageControllerUpcallClient::put_timeline_import_status`] retries + // forever internally, so errors returned by it can only be due to cancellation. + info!("failed to notify storcon about permanent import error"); + } + + // Will be logged by [`Tenant::create_timeline_import_pgdata_task`] + error + } else { + anyhow::anyhow!("Import task cancelled") + } +} diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index c6d2944769..3e10a4e6d6 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -29,71 +29,127 @@ //! - version-specific CheckPointData (=> pgv abstraction, already exists for regular walingest) use std::collections::HashSet; +use std::hash::{Hash, Hasher}; use std::ops::Range; use std::sync::Arc; -use anyhow::{bail, ensure}; +use anyhow::ensure; use bytes::Bytes; +use futures::stream::FuturesOrdered; use itertools::Itertools; +use pageserver_api::config::TimelineImportConfig; use pageserver_api::key::{ CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, Key, TWOPHASEDIR_KEY, rel_block_to_key, rel_dir_to_key, rel_size_to_key, relmap_file_key, slru_block_to_key, slru_dir_to_key, slru_segment_size_to_key, }; use pageserver_api::keyspace::{contiguous_range_len, is_contiguous_range, singleton_range}; +use pageserver_api::models::{ShardImportProgress, ShardImportProgressV1, ShardImportStatus}; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; use postgres_ffi::relfile_utils::parse_relfilename; use postgres_ffi::{BLCKSZ, pg_constants}; use remote_storage::RemotePath; 
-use tokio::task::JoinSet; -use tracing::{Instrument, debug, info_span, instrument}; +use tokio::sync::Semaphore; +use tokio_stream::StreamExt; +use tracing::{debug, instrument}; use utils::bin_ser::BeSer; use utils::lsn::Lsn; +use utils::pausable_failpoint; use super::Timeline; use super::importbucket_client::{ControlFile, RemoteStorageWrapper}; use crate::assert_u64_eq_usize::UsizeIsU64; use crate::context::{DownloadBehavior, RequestContext}; +use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient}; use crate::pgdatadir_mapping::{ DbDirectory, RelDirectory, SlruSegmentDirectory, TwoPhaseDirectory, }; use crate::task_mgr::TaskKind; -use crate::tenant::storage_layer::{ImageLayerWriter, Layer}; +use crate::tenant::storage_layer::{AsLayerDesc, ImageLayerWriter, Layer}; pub async fn run( timeline: Arc, - pgdata_lsn: Lsn, control_file: ControlFile, storage: RemoteStorageWrapper, + import_progress: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { - Flow { - timeline, - pgdata_lsn, - control_file, - tasks: Vec::new(), - storage, + // Match how we run the import based on the progress version. + // If there's no import progress, it means that this is a new import + // and we can use whichever version we want. 
+ match import_progress { + Some(ShardImportProgress::V1(progress)) => { + run_v1(timeline, control_file, storage, Some(progress), ctx).await + } + None => run_v1(timeline, control_file, storage, None, ctx).await, } - .run(ctx) - .await } -struct Flow { +async fn run_v1( timeline: Arc, - pgdata_lsn: Lsn, control_file: ControlFile, - tasks: Vec, storage: RemoteStorageWrapper, + import_progress: Option, + ctx: &RequestContext, +) -> anyhow::Result<()> { + let planner = Planner { + control_file, + storage: storage.clone(), + shard: timeline.shard_identity, + tasks: Vec::default(), + }; + + let import_config = &timeline.conf.timeline_import_config; + let plan = planner.plan(import_config).await?; + + // Hash the plan and compare with the hash of the plan we got back from the storage controller. + // If the two match, it means that the planning stage had the same output. + // + // This is not intended to be a cryptographically secure hash. + const SEED: u64 = 42; + let mut hasher = twox_hash::XxHash64::with_seed(SEED); + plan.hash(&mut hasher); + let plan_hash = hasher.finish(); + + if let Some(progress) = &import_progress { + if plan_hash != progress.import_plan_hash { + anyhow::bail!("Import plan does not match storcon metadata"); + } + + // Handle collisions on jobs of unequal length + if progress.jobs != plan.jobs.len() { + anyhow::bail!("Import plan job length does not match storcon metadata") + } + } + + pausable_failpoint!("import-timeline-pre-execute-pausable"); + + let start_from_job_idx = import_progress.map(|progress| progress.completed); + plan.execute(timeline, start_from_job_idx, plan_hash, import_config, ctx) + .await } -impl Flow { - /// Perform the ingestion into [`Self::timeline`]. - /// Assumes the timeline is empty (= no layers). 
- pub async fn run(mut self, ctx: &RequestContext) -> anyhow::Result<()> { - let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); +struct Planner { + control_file: ControlFile, + storage: RemoteStorageWrapper, + shard: ShardIdentity, + tasks: Vec, +} - self.pgdata_lsn = pgdata_lsn; +#[derive(Hash)] +struct Plan { + jobs: Vec, + // Included here such that it ends up in the hash for the plan + shard: ShardIdentity, +} + +impl Planner { + /// Creates an import plan + /// + /// This function is and must remain pure: given the same input, it will generate the same import plan. + async fn plan(mut self, import_config: &TimelineImportConfig) -> anyhow::Result { + let pgdata_lsn = Lsn(self.control_file.control_file_data().checkPoint).align(); let datadir = PgDataDir::new(&self.storage).await?; @@ -115,7 +171,7 @@ impl Flow { } // Import SLRUs - if self.timeline.tenant_shard_id.is_shard_zero() { + if self.shard.is_shard_zero() { // pg_xact (01:00 keyspace) self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) .await?; @@ -166,14 +222,16 @@ impl Flow { let mut last_end_key = Key::MIN; let mut current_chunk = Vec::new(); let mut current_chunk_size: usize = 0; - let mut parallel_jobs = Vec::new(); + let mut jobs = Vec::new(); for task in std::mem::take(&mut self.tasks).into_iter() { - if current_chunk_size + task.total_size() > 1024 * 1024 * 1024 { + if current_chunk_size + task.total_size() + > import_config.import_job_soft_size_limit.into() + { let key_range = last_end_key..task.key_range().start; - parallel_jobs.push(ChunkProcessingJob::new( + jobs.push(ChunkProcessingJob::new( key_range.clone(), std::mem::take(&mut current_chunk), - &self, + pgdata_lsn, )); last_end_key = key_range.end; current_chunk_size = 0; @@ -181,45 +239,16 @@ impl Flow { current_chunk_size += task.total_size(); current_chunk.push(task); } - parallel_jobs.push(ChunkProcessingJob::new( + jobs.push(ChunkProcessingJob::new( last_end_key..Key::MAX, 
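current_chunk, - &self, + pgdata_lsn, )); - // Start all jobs simultaneosly - let mut work = JoinSet::new(); - // TODO: semaphore? - for job in parallel_jobs { - let ctx: RequestContext = - ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); - work.spawn(async move { job.run(&ctx).await }.instrument(info_span!("parallel_job"))); - } - let mut results = Vec::new(); - while let Some(result) = work.join_next().await { - match result { - Ok(res) => { - results.push(res); - } - Err(_joinset_err) => { - results.push(Err(anyhow::anyhow!( - "parallel job panicked or cancelled, check pageserver logs" - ))); - } - } - } - - if results.iter().all(|r| r.is_ok()) { - Ok(()) - } else { - let mut msg = String::new(); - for result in results { - if let Err(err) = result { - msg.push_str(&format!("{err:?}\n\n")); - } - } - bail!("Some parallel jobs failed:\n\n{msg}"); - } + Ok(Plan { + jobs, + shard: self.shard, + }) } #[instrument(level = tracing::Level::DEBUG, skip_all, fields(dboid=%db.dboid, tablespace=%db.spcnode, path=%db.path))] @@ -266,7 +295,7 @@ impl Flow { let end_key = rel_block_to_key(file.rel_tag, start_blk + (len / 8192) as u32); self.tasks .push(AnyImportTask::RelBlocks(ImportRelBlocksTask::new( - *self.timeline.get_shard_identity(), + self.shard, start_key..end_key, &file.path, self.storage.clone(), @@ -289,7 +318,7 @@ impl Flow { } async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { - assert!(self.timeline.tenant_shard_id.is_shard_zero()); + assert!(self.shard.is_shard_zero()); let segments = self.storage.listfilesindir(path).await?; let segments: Vec<(String, u32, usize)> = segments @@ -344,6 +373,100 @@ impl Flow { } } +impl Plan { + async fn execute( + self, + timeline: Arc, + start_after_job_idx: Option, + import_plan_hash: u64, + import_config: &TimelineImportConfig, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &timeline.cancel); + + let mut work = FuturesOrdered::new(); + let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into())); + + let jobs_in_plan = self.jobs.len(); + + let mut jobs = self + .jobs + .into_iter() + .enumerate() + .map(|(idx, job)| (idx + 1, job)) + .filter(|(idx, _job)| { + // Filter out any jobs that have been done already + if let Some(start_after) = start_after_job_idx { + *idx > start_after + } else { + true + } + }) + .peekable(); + + let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0); + let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into(); + + // Run import jobs concurrently up to the limit specified by the pageserver configuration. + // Note that we process completed futures in the order of insertion. This will be the + // building block for resuming imports across pageserver restarts or tenant migrations. + while last_completed_job_idx < jobs_in_plan { + tokio::select! { + permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => { + let permit = permit.expect("never closed"); + let (job_idx, job) = jobs.next().expect("we peeked"); + + let job_timeline = timeline.clone(); + let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); + + work.push_back(tokio::task::spawn(async move { + let _permit = permit; + let res = job.run(job_timeline, &ctx).await; + (job_idx, res) + })); + }, + maybe_complete_job_idx = work.next() => { + match maybe_complete_job_idx { + Some(Ok((job_idx, res))) => { + assert!(last_completed_job_idx.checked_add(1).unwrap() == job_idx); + + res?; + last_completed_job_idx = job_idx; + + if last_completed_job_idx % checkpoint_every == 0 { + let progress = ShardImportProgressV1 { + jobs: jobs_in_plan, + completed: last_completed_job_idx, + import_plan_hash, + }; + + storcon_client.put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, 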
&timeline.cancel); + + let mut work = FuturesOrdered::new(); + let semaphore = Arc::new(Semaphore::new(import_config.import_job_concurrency.into())); + + let jobs_in_plan = self.jobs.len(); + + let mut jobs = self + .jobs + .into_iter() + .enumerate() + .map(|(idx, job)| (idx + 1, job)) + .filter(|(idx, _job)| { + // Filter out any jobs that have been done already + if let Some(start_after) = start_after_job_idx { + *idx > start_after + } else { + true + } + }) + .peekable(); + + let mut last_completed_job_idx = start_after_job_idx.unwrap_or(0); + let checkpoint_every: usize = import_config.import_job_checkpoint_threshold.into(); + + // Run import jobs concurrently up to the limit specified by the pageserver configuration. + // Note that we process completed futures in the oreder of insertion. This will be the + // building block for resuming imports across pageserver restarts or tenant migrations. + while last_completed_job_idx < jobs_in_plan { + tokio::select! { + permit = semaphore.clone().acquire_owned(), if jobs.peek().is_some() => { + let permit = permit.expect("never closed"); + let (job_idx, job) = jobs.next().expect("we peeked"); + + let job_timeline = timeline.clone(); + let ctx = ctx.detached_child(TaskKind::ImportPgdata, DownloadBehavior::Error); + + work.push_back(tokio::task::spawn(async move { + let _permit = permit; + let res = job.run(job_timeline, &ctx).await; + (job_idx, res) + })); + }, + maybe_complete_job_idx = work.next() => { + match maybe_complete_job_idx { + Some(Ok((job_idx, res))) => { + assert!(last_completed_job_idx.checked_add(1).unwrap() == job_idx); + + res?; + last_completed_job_idx = job_idx; + + if last_completed_job_idx % checkpoint_every == 0 { + let progress = ShardImportProgressV1 { + jobs: jobs_in_plan, + completed: last_completed_job_idx, + import_plan_hash, + }; + + storcon_client.put_timeline_import_status( + timeline.tenant_shard_id, + timeline.timeline_id, + timeline.generation, + 
ShardImportStatus::InProgress(Some(ShardImportProgress::V1(progress))) + ) + .await + .map_err(|_err| { + anyhow::anyhow!("Shut down while putting timeline import status") + })?; + } + }, + Some(Err(_)) => { + anyhow::bail!( + "import job panicked or cancelled" + ); + } + None => {} + } + } + } + } + + Ok(()) + } +} + // // dbdir iteration tools // @@ -512,6 +635,15 @@ struct ImportSingleKeyTask { buf: Bytes, } +impl Hash for ImportSingleKeyTask { + fn hash(&self, state: &mut H) { + let ImportSingleKeyTask { key, buf } = self; + + key.hash(state); + buf.hash(state); + } +} + impl ImportSingleKeyTask { fn new(key: Key, buf: Bytes) -> Self { ImportSingleKeyTask { key, buf } @@ -540,6 +672,20 @@ struct ImportRelBlocksTask { storage: RemoteStorageWrapper, } +impl Hash for ImportRelBlocksTask { + fn hash(&self, state: &mut H) { + let ImportRelBlocksTask { + shard_identity: _, + key_range, + path, + storage: _, + } = self; + + key_range.hash(state); + path.hash(state); + } +} + impl ImportRelBlocksTask { fn new( shard_identity: ShardIdentity, @@ -624,6 +770,19 @@ struct ImportSlruBlocksTask { storage: RemoteStorageWrapper, } +impl Hash for ImportSlruBlocksTask { + fn hash(&self, state: &mut H) { + let ImportSlruBlocksTask { + key_range, + path, + storage: _, + } = self; + + key_range.hash(state); + path.hash(state); + } +} + impl ImportSlruBlocksTask { fn new(key_range: Range, path: &RemotePath, storage: RemoteStorageWrapper) -> Self { ImportSlruBlocksTask { @@ -666,6 +825,7 @@ impl ImportTask for ImportSlruBlocksTask { } } +#[derive(Hash)] enum AnyImportTask { SingleKey(ImportSingleKeyTask), RelBlocks(ImportRelBlocksTask), @@ -712,8 +872,8 @@ impl From for AnyImportTask { } } +#[derive(Hash)] struct ChunkProcessingJob { - timeline: Arc, range: Range, tasks: Vec, @@ -721,25 +881,24 @@ struct ChunkProcessingJob { } impl ChunkProcessingJob { - fn new(range: Range, tasks: Vec, env: &Flow) -> Self { - assert!(env.pgdata_lsn.is_valid()); + fn new(range: Range, tasks: Vec, 
pgdata_lsn: Lsn) -> Self { + assert!(pgdata_lsn.is_valid()); Self { - timeline: env.timeline.clone(), range, tasks, - pgdata_lsn: env.pgdata_lsn, + pgdata_lsn, } } - async fn run(self, ctx: &RequestContext) -> anyhow::Result<()> { + async fn run(self, timeline: Arc, ctx: &RequestContext) -> anyhow::Result<()> { let mut writer = ImageLayerWriter::new( - self.timeline.conf, - self.timeline.timeline_id, - self.timeline.tenant_shard_id, + timeline.conf, + timeline.timeline_id, + timeline.tenant_shard_id, &self.range, self.pgdata_lsn, - &self.timeline.gate, - self.timeline.cancel.clone(), + &timeline.gate, + timeline.cancel.clone(), ctx, ) .await?; @@ -751,24 +910,54 @@ impl ChunkProcessingJob { let resident_layer = if nimages > 0 { let (desc, path) = writer.finish(ctx).await?; - Layer::finish_creating(self.timeline.conf, &self.timeline, desc, &path)? + + { + let guard = timeline.layers.read().await; + let existing_layer = guard.try_get_from_key(&desc.key()); + if let Some(layer) = existing_layer { + if layer.metadata().generation != timeline.generation { + return Err(anyhow::anyhow!( + "Import attempted to rewrite layer file in the same generation: {}", + layer.local_path() + )); + } + } + } + + Layer::finish_creating(timeline.conf, &timeline, desc, &path)? } else { // dropping the writer cleans up return Ok(()); }; - // this is sharing the same code as create_image_layers - let mut guard = self.timeline.layers.write().await; - guard - .open_mut()? - .track_new_image_layers(&[resident_layer.clone()], &self.timeline.metrics); + // The same import job might run multiple times since not each job is checkpointed. + // Hence, we must support the cases where the layer already exists. We cannot be + // certain that the existing layer is identical to the new one, so in that case + // we replace the old layer with the one we just generated. 
+ + let mut guard = timeline.layers.write().await; + + let existing_layer = guard + .try_get_from_key(&resident_layer.layer_desc().key()) + .cloned(); + match existing_layer { + Some(existing) => { + guard.open_mut()?.rewrite_layers( + &[(existing.clone(), resident_layer.clone())], + &[], + &timeline.metrics, + ); + } + None => { + guard + .open_mut()? + .track_new_image_layers(&[resident_layer.clone()], &timeline.metrics); + } + } + crate::tenant::timeline::drop_wlock(guard); - // Schedule the layer for upload but don't add barriers such as - // wait for completion or index upload, so we don't inhibit upload parallelism. - // TODO: limit upload parallelism somehow (e.g. by limiting concurrency of jobs?) - // TODO: or regulate parallelism by upload queue depth? Prob should happen at a higher level. - self.timeline + timeline .remote_client .schedule_layer_file_upload(resident_layer)?; diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index e7aa8f6038..34313748b7 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -190,31 +190,6 @@ impl RemoteStorageWrapper { Ok(Some(res)) } - #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))] - pub async fn put_json(&self, path: &RemotePath, value: &T) -> anyhow::Result<()> - where - T: serde::Serialize, - { - let buf = serde_json::to_vec(value)?; - let bytes = Bytes::from(buf); - utils::backoff::retry( - || async { - let size = bytes.len(); - let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone()))); - self.storage - .upload_storage_object(bytes, size, path, &self.cancel) - .await - }, - remote_storage::TimeoutOrCancel::caused_by_cancel, - 1, - u32::MAX, - &format!("put json {path}"), - &self.cancel, - ) - .await - .expect("practically infinite retries") - } - #[instrument(level = 
tracing::Level::DEBUG, skip_all, fields(%path))] pub async fn get_range( &self, diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs index 57c647cc7f..d9f4da4748 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs @@ -5,9 +5,3 @@ pub struct PgdataStatus { pub done: bool, // TODO: remaining fields } - -#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)] -pub struct ShardStatus { - pub done: bool, - // TODO: remaining fields -} diff --git a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs index ea7a41b25f..371fc857dc 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/index_part_format.rs @@ -64,4 +64,12 @@ impl Root { }, } } + pub fn started_at(&self) -> &chrono::NaiveDateTime { + match self { + Root::V1(v1) => match v1 { + V1::InProgress(in_progress) => &in_progress.started_at, + V1::Done(done) => &done.started_at, + }, + } + } } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index 4f80073cc3..0f73eb839b 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -63,6 +63,7 @@ pub struct WalReceiver { /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token. /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`. 
cancel: CancellationToken, + task: tokio::task::JoinHandle<()>, } impl WalReceiver { @@ -79,7 +80,7 @@ impl WalReceiver { let loop_status = Arc::new(std::sync::RwLock::new(None)); let manager_status = Arc::clone(&loop_status); let cancel = timeline.cancel.child_token(); - WALRECEIVER_RUNTIME.spawn({ + let task = WALRECEIVER_RUNTIME.spawn({ let cancel = cancel.clone(); async move { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -120,14 +121,25 @@ impl WalReceiver { Self { manager_status, cancel, + task, } } #[instrument(skip_all, level = tracing::Level::DEBUG)] - pub fn cancel(&self) { + pub async fn shutdown(self) { debug_assert_current_span_has_tenant_and_timeline_id(); debug!("cancelling walreceiver tasks"); self.cancel.cancel(); + match self.task.await { + Ok(()) => debug!("Shutdown success"), + Err(je) if je.is_cancelled() => unreachable!("not used"), + Err(je) if je.is_panic() => { + // already logged by panic hook + } + Err(je) => { + error!("shutdown walreceiver task join error: {je}") + } + } } pub(crate) fn status(&self) -> Option { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index f429e59ef3..45b6e44c54 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -74,6 +74,8 @@ pub struct VirtualFile { impl VirtualFile { /// Open a file in read-only mode. Like File::open. + /// + /// Insensitive to `virtual_file_io_mode` setting. pub async fn open>( path: P, ctx: &RequestContext, @@ -95,31 +97,20 @@ impl VirtualFile { Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await } + /// `O_DIRECT` will be enabled base on `virtual_file_io_mode`. 
pub async fn open_with_options_v2>( path: P, - #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut open_options: OpenOptions, + mut open_options: OpenOptions, ctx: &RequestContext, ) -> Result { let mode = get_io_mode(); - let set_o_direct = match (mode, open_options.is_write()) { + let direct = match (mode, open_options.is_write()) { (IoMode::Buffered, _) => false, - #[cfg(target_os = "linux")] (IoMode::Direct, false) => true, - #[cfg(target_os = "linux")] (IoMode::Direct, true) => false, - #[cfg(target_os = "linux")] (IoMode::DirectRw, _) => true, }; - if set_o_direct { - #[cfg(target_os = "linux")] - { - open_options = open_options.custom_flags(nix::libc::O_DIRECT); - } - #[cfg(not(target_os = "linux"))] - unreachable!( - "O_DIRECT is not supported on this platform, IoMode's that result in set_o_direct=true shouldn't even be defined" - ); - } + open_options = open_options.direct(direct); let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; Ok(VirtualFile { inner, _mode: mode }) } @@ -417,7 +408,7 @@ impl OpenFiles { /// error types may be elegible for retry. pub(crate) fn is_fatal_io_error(e: &std::io::Error) -> bool { use nix::errno::Errno::*; - match e.raw_os_error().map(nix::errno::from_i32) { + match e.raw_os_error().map(nix::errno::Errno::from_raw) { Some(EIO) => { // Terminate on EIO because we no longer trust the device to store // data safely, or to uphold persistence guarantees on fsync. 
@@ -791,6 +782,12 @@ impl VirtualFileInner { where Buf: tokio_epoll_uring::IoBufMut + Send, { + self.validate_direct_io( + Slice::stable_ptr(&buf).addr(), + Slice::bytes_total(&buf), + offset, + ); + let file_guard = match self .lock_file() .await @@ -816,6 +813,8 @@ impl VirtualFileInner { offset: u64, ctx: &RequestContext, ) -> (FullSlice, Result) { + self.validate_direct_io(buf.as_ptr().addr(), buf.len(), offset); + let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, Err(e) => return (buf, Err(e)), @@ -830,6 +829,64 @@ impl VirtualFileInner { (buf, result) }) } + + /// Validate all reads and writes to adhere to the O_DIRECT requirements of our production systems. + /// + /// Validating it iin userspace sets a consistent bar, independent of what actual OS/filesystem/block device is in use. + fn validate_direct_io(&self, addr: usize, size: usize, offset: u64) { + // TODO: eventually enable validation in the builds we use in real environments like staging, preprod, and prod. + if !(cfg!(feature = "testing") || cfg!(test)) { + return; + } + if !self.open_options.is_direct() { + return; + } + + // Validate buffer memory alignment. + // + // What practically matters as of Linux 6.1 is bdev_dma_alignment() + // which is practically between 512 and 4096. + // On our production systems, the value is 512. + // The IoBuffer/IoBufferMut hard-code that value. + // + // Because the alloctor might return _more_ aligned addresses than requested, + // there is a chance that testing would not catch violations of a runtime requirement stricter than 512. + { + let requirement = 512; + let remainder = addr % requirement; + assert!( + remainder == 0, + "Direct I/O buffer must be aligned: buffer_addr=0x{addr:x} % 0x{requirement:x} = 0x{remainder:x}" + ); + } + + // Validate offset alignment. + // + // We hard-code 512 throughout the code base. + // So enforce just that and not anything more restrictive. 
+ // Even the shallowest testing will expose more restrictive requirements if those ever arise. + { + let requirement = 512; + let remainder = offset % requirement; + assert!( + remainder == 0, + "Direct I/O offset must be aligned: offset=0x{offset:x} % 0x{requirement:x} = 0x{remainder:x}" + ); + } + + // Validate buffer size multiple requirement. + // + // The requirement in Linux 6.1 is bdev_logical_block_size(). + // On our production systems, that is 512. + { + let requirement = 512; + let remainder = size % requirement; + assert!( + remainder == 0, + "Direct I/O buffer size must be a multiple of {requirement}: size=0x{size:x} % 0x{requirement:x} = 0x{remainder:x}" + ); + } + } } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 @@ -1218,7 +1275,6 @@ mod tests { use std::sync::Arc; use owned_buffers_io::io_buf_ext::IoBufExt; - use owned_buffers_io::slice::SliceMutExt; use rand::seq::SliceRandom; use rand::{Rng, thread_rng}; @@ -1226,162 +1282,38 @@ mod tests { use crate::context::DownloadBehavior; use crate::task_mgr::TaskKind; - enum MaybeVirtualFile { - VirtualFile(VirtualFile), - File(File), - } - - impl From for MaybeVirtualFile { - fn from(vf: VirtualFile) -> Self { - MaybeVirtualFile::VirtualFile(vf) - } - } - - impl MaybeVirtualFile { - async fn read_exact_at( - &self, - mut slice: tokio_epoll_uring::Slice, - offset: u64, - ctx: &RequestContext, - ) -> Result, Error> { - match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, - MaybeVirtualFile::File(file) => { - let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed(); - file.read_exact_at(rust_slice, offset).map(|()| slice) - } - } - } - async fn write_all_at( - &self, - buf: FullSlice, - offset: u64, - ctx: &RequestContext, - ) -> Result<(), Error> { - match self { - MaybeVirtualFile::VirtualFile(file) => { - let (_buf, res) = file.write_all_at(buf, offset, ctx).await; - res - } - MaybeVirtualFile::File(file) => 
file.write_all_at(&buf[..], offset), - } - } - - // Helper function to slurp a portion of a file into a string - async fn read_string_at( - &mut self, - pos: u64, - len: usize, - ctx: &RequestContext, - ) -> Result { - let slice = IoBufferMut::with_capacity(len).slice_full(); - assert_eq!(slice.bytes_total(), len); - let slice = self.read_exact_at(slice, pos, ctx).await?; - let buf = slice.into_inner(); - assert_eq!(buf.len(), len); - - Ok(String::from_utf8(buf.to_vec()).unwrap()) - } - } - #[tokio::test] async fn test_virtual_files() -> anyhow::Result<()> { - // The real work is done in the test_files() helper function. This - // allows us to run the same set of tests against a native File, and - // VirtualFile. We trust the native Files and wouldn't need to test them, - // but this allows us to verify that the operations return the same - // results with VirtualFiles as with native Files. (Except that with - // native files, you will run out of file descriptors if the ulimit - // is low enough.) - struct A; - - impl Adapter for A { - async fn open( - path: Utf8PathBuf, - opts: OpenOptions, - ctx: &RequestContext, - ) -> Result { - let vf = VirtualFile::open_with_options_v2(&path, opts, ctx).await?; - Ok(MaybeVirtualFile::VirtualFile(vf)) - } - } - test_files::("virtual_files").await - } - - #[tokio::test] - async fn test_physical_files() -> anyhow::Result<()> { - struct B; - - impl Adapter for B { - async fn open( - path: Utf8PathBuf, - opts: OpenOptions, - _ctx: &RequestContext, - ) -> Result { - Ok(MaybeVirtualFile::File({ - let owned_fd = opts.open(path.as_std_path()).await?; - File::from(owned_fd) - })) - } - } - - test_files::("physical_files").await - } - - /// This is essentially a closure which returns a MaybeVirtualFile, but because rust edition - /// 2024 is not yet out with new lifetime capture or outlives rules, this is a async function - /// in trait which benefits from the new lifetime capture rules already. 
- trait Adapter { - async fn open( - path: Utf8PathBuf, - opts: OpenOptions, - ctx: &RequestContext, - ) -> Result; - } - - async fn test_files(testname: &str) -> anyhow::Result<()> - where - A: Adapter, - { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); - let testdir = crate::config::PageServerConf::test_repo_dir(testname); + let testdir = crate::config::PageServerConf::test_repo_dir("test_virtual_files"); std::fs::create_dir_all(&testdir)?; + let zeropad512 = |content: &[u8]| { + let mut buf = IoBufferMut::with_capacity_zeroed(512); + buf[..content.len()].copy_from_slice(content); + buf.freeze().slice_len() + }; + let path_a = testdir.join("file_a"); - let mut file_a = A::open( + let file_a = VirtualFile::open_with_options_v2( path_a.clone(), OpenOptions::new() + .read(true) .write(true) + // set create & truncate flags to ensure when we trigger a reopen later in this test, + // the reopen_options must have masked out those flags; if they don't, then + // the after reopen we will fail to read the `content_a` that we write here. .create(true) - .truncate(true) - .to_owned(), + .truncate(true), &ctx, ) .await?; + let (_, res) = file_a.write_all_at(zeropad512(b"content_a"), 0, &ctx).await; + res?; - file_a - .write_all_at(IoBuffer::from(b"foobar").slice_len(), 0, &ctx) - .await?; - - // cannot read from a file opened in write-only mode - let _ = file_a.read_string_at(0, 1, &ctx).await.unwrap_err(); - - // Close the file and re-open for reading - let mut file_a = A::open(path_a, OpenOptions::new().read(true), &ctx).await?; - - // cannot write to a file opened in read-only mode - let _ = file_a - .write_all_at(IoBuffer::from(b"bar").slice_len(), 0, &ctx) - .await - .unwrap_err(); - - // Try simple read - assert_eq!("foobar", file_a.read_string_at(0, 6, &ctx).await?); - - // Create another test file, and try FileExt functions on it. 
let path_b = testdir.join("file_b"); - let mut file_b = A::open( + let file_b = VirtualFile::open_with_options_v2( path_b.clone(), OpenOptions::new() .read(true) @@ -1391,37 +1323,44 @@ mod tests { &ctx, ) .await?; - file_b - .write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx) - .await?; - file_b - .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx) - .await?; + let (_, res) = file_b.write_all_at(zeropad512(b"content_b"), 0, &ctx).await; + res?; - assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); + let assert_first_512_eq = async |vfile: &VirtualFile, expect: &[u8]| { + let buf = vfile + .read_exact_at(IoBufferMut::with_capacity_zeroed(512).slice_full(), 0, &ctx) + .await + .unwrap(); + assert_eq!(&buf[..], &zeropad512(expect)[..]); + }; - // Open a lot of files, enough to cause some evictions. (Or to be precise, - // open the same file many times. The effect is the same.) + // Open a lot of file descriptors / VirtualFile instances. + // Enough to cause some evictions in the fd cache. - let mut vfiles = Vec::new(); + let mut file_b_dupes = Vec::new(); for _ in 0..100 { - let mut vfile = A::open(path_b.clone(), OpenOptions::new().read(true), &ctx).await?; - assert_eq!("FOOBAR", vfile.read_string_at(0, 6, &ctx).await?); - vfiles.push(vfile); + let vfile = VirtualFile::open_with_options_v2( + path_b.clone(), + OpenOptions::new().read(true), + &ctx, + ) + .await?; + assert_first_512_eq(&vfile, b"content_b").await; + file_b_dupes.push(vfile); } // make sure we opened enough files to definitely cause evictions. - assert!(vfiles.len() > TEST_MAX_FILE_DESCRIPTORS * 2); + assert!(file_b_dupes.len() > TEST_MAX_FILE_DESCRIPTORS * 2); // The underlying file descriptor for 'file_a' should be closed now. Try to read - // from it again. - assert_eq!("foobar", file_a.read_string_at(0, 6, &ctx).await?); + // from it again. The VirtualFile reopens the file internally. 
+ assert_first_512_eq(&file_a, b"content_a").await; // Check that all the other FDs still work too. Use them in random order for // good measure. - vfiles.as_mut_slice().shuffle(&mut thread_rng()); - for vfile in vfiles.iter_mut() { - assert_eq!("OOBAR", vfile.read_string_at(1, 5, &ctx).await?); + file_b_dupes.as_mut_slice().shuffle(&mut thread_rng()); + for vfile in file_b_dupes.iter_mut() { + assert_first_512_eq(vfile, b"content_b").await; } Ok(()) @@ -1452,7 +1391,7 @@ mod tests { // Open the file many times. let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFileInner::open_with_options( + let f = VirtualFile::open_with_options_v2( &test_file_path, OpenOptions::new().read(true), &ctx, @@ -1497,8 +1436,6 @@ mod tests { #[tokio::test] async fn test_atomic_overwrite_basic() { - let ctx = - RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_basic"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1508,26 +1445,22 @@ mod tests { VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); - let post = file.read_string_at(0, 3, &ctx).await.unwrap(); + + let post = std::fs::read_to_string(&path).unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); - drop(file); VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); - let post = file.read_string_at(0, 3, &ctx).await.unwrap(); + + let post = std::fs::read_to_string(&path).unwrap(); assert_eq!(post, "bar"); assert!(!tmp_path.exists()); - drop(file); } #[tokio::test] async fn test_atomic_overwrite_preexisting_tmp() { - let ctx = - RequestContext::new(TaskKind::UnitTest, 
DownloadBehavior::Error).with_scope_unit_test(); let testdir = crate::config::PageServerConf::test_repo_dir("test_atomic_overwrite_preexisting_tmp"); std::fs::create_dir_all(&testdir).unwrap(); @@ -1542,10 +1475,8 @@ mod tests { .await .unwrap(); - let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); - let post = file.read_string_at(0, 3, &ctx).await.unwrap(); + let post = std::fs::read_to_string(&path).unwrap(); assert_eq!(post, "foo"); assert!(!tmp_path.exists()); - drop(file); } } diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index dd04fb561a..3cde34eda7 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -111,18 +111,20 @@ pub(crate) fn get() -> IoEngine { use std::os::unix::prelude::FileExt; use std::sync::atomic::{AtomicU8, Ordering}; +#[cfg(target_os = "linux")] +use {std::time::Duration, tracing::info}; use super::owned_buffers_io::io_buf_ext::FullSlice; use super::owned_buffers_io::slice::SliceMutExt; use super::{FileGuard, Metadata}; #[cfg(target_os = "linux")] -fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { +pub(super) fn epoll_uring_error_to_std( + e: tokio_epoll_uring::Error, +) -> std::io::Error { match e { tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } + tokio_epoll_uring::Error::System(system) => std::io::Error::other(system), } } @@ -149,7 +151,11 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.read(file_guard, offset, slice).await; + let (resources, res) = + retry_ecanceled_once((file_guard, slice), |(file_guard, slice)| async { + system.read(file_guard, offset, slice).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -164,7 +170,10 
@@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.fsync(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.fsync(file_guard).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -182,7 +191,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.fdatasync(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.fdatasync(file_guard).await + }) + .await; (resources, res.map_err(epoll_uring_error_to_std)) } } @@ -201,7 +213,10 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.statx(file_guard).await; + let (resources, res) = retry_ecanceled_once(file_guard, |file_guard| async { + system.statx(file_guard).await + }) + .await; ( resources, res.map_err(epoll_uring_error_to_std).map(Metadata::from), @@ -224,6 +239,7 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { // TODO: ftruncate op for tokio-epoll-uring + // Don't forget to use retry_ecanceled_once let res = file_guard.with_std_file(|std_file| std_file.set_len(len)); (file_guard, res) } @@ -245,8 +261,11 @@ impl IoEngine { #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let ((file_guard, slice), res) = - system.write(file_guard, offset, buf.into_raw_slice()).await; + let ((file_guard, slice), res) = retry_ecanceled_once( + (file_guard, buf.into_raw_slice()), + async |(file_guard, buf)| system.write(file_guard, offset, buf).await, + ) + .await; ( (file_guard, FullSlice::must_new(slice)), 
res.map_err(epoll_uring_error_to_std), @@ -282,6 +301,56 @@ impl IoEngine { } } +/// We observe in tests that stop pageserver with SIGTERM immediately after it was ingesting data, +/// occasionally buffered writers fail (and get retried by BufferedWriter) with ECANCELED. +/// The problem is believed to be a race condition in how io_uring handles punted async work (io-wq) and signals. +/// Investigation ticket: +/// +/// This function retries the operation once if it fails with ECANCELED. +/// ONLY USE FOR IDEMPOTENT [`super::VirtualFile`] operations. +#[cfg(target_os = "linux")] +pub(super) async fn retry_ecanceled_once( + resources: T, + f: F, +) -> (T, Result>) +where + F: Fn(T) -> Fut, + Fut: std::future::Future>)>, + T: Send, + V: Send, +{ + let (resources, res) = f(resources).await; + let Err(e) = res else { + return (resources, res); + }; + let tokio_epoll_uring::Error::Op(err) = e else { + return (resources, Err(e)); + }; + if err.raw_os_error() != Some(nix::libc::ECANCELED) { + return (resources, Err(tokio_epoll_uring::Error::Op(err))); + } + { + static RATE_LIMIT: std::sync::Mutex = + std::sync::Mutex::new(utils::rate_limit::RateLimit::new(Duration::from_secs(1))); + let mut guard = RATE_LIMIT.lock().unwrap(); + guard.call2(|rate_limit_stats| { + info!( + %rate_limit_stats, "ECANCELED observed, assuming it is due to a signal being received by the submitting thread, retrying after a delay; this message is rate-limited" + ); + }); + drop(guard); + } + tokio::time::sleep(Duration::from_millis(100)).await; // something big enough to beat even heavily overcommitted CI runners + let (resources, res) = f(resources).await; + (resources, res) +} + +pub(super) fn panic_operation_must_be_idempotent() { + panic!( + "unsupported; io_engine may retry operations internally and thus needs them to be idempotent (retry_ecanceled_once)" + ) +} + pub enum FeatureTestResult { PlatformPreferred(IoEngineKind), Worse { diff --git a/pageserver/src/virtual_file/open_options.rs 
b/pageserver/src/virtual_file/open_options.rs index 2a7bb693f2..7d478f3600 100644 --- a/pageserver/src/virtual_file/open_options.rs +++ b/pageserver/src/virtual_file/open_options.rs @@ -8,7 +8,13 @@ use super::io_engine::IoEngine; #[derive(Debug, Clone)] pub struct OpenOptions { + /// We keep a copy of the write() flag we pass to the `inner`` `OptionOptions` + /// to support [`Self::is_write`]. write: bool, + /// We don't expose + pass through a raw `custom_flags()` style API. + /// The only custom flag we support is `O_DIRECT`, which we track here + /// and map to `custom_flags()` in the [`Self::open`] method. + direct: bool, inner: Inner, } #[derive(Debug, Clone)] @@ -30,6 +36,7 @@ impl Default for OpenOptions { }; Self { write: false, + direct: false, inner, } } @@ -44,6 +51,10 @@ impl OpenOptions { self.write } + pub(super) fn is_direct(&self) -> bool { + self.direct + } + pub fn read(mut self, read: bool) -> Self { match &mut self.inner { Inner::StdFs(x) => { @@ -110,18 +121,48 @@ impl OpenOptions { self } + /// Don't use, `O_APPEND` is not supported. + pub fn append(&mut self, _append: bool) { + super::io_engine::panic_operation_must_be_idempotent(); + } + pub(in crate::virtual_file) async fn open(&self, path: &Path) -> std::io::Result { - match &self.inner { - Inner::StdFs(x) => x.open(path).map(|file| file.into()), + #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] + let mut custom_flags = 0; + if self.direct { #[cfg(target_os = "linux")] - Inner::TokioEpollUring(x) => { + { + custom_flags |= nix::libc::O_DIRECT; + } + #[cfg(not(target_os = "linux"))] + { + // Other platforms may be used for development but don't necessarily have a 1:1 equivalent to Linux's O_DIRECT (macOS!). + // Just don't set the flag; to catch alignment bugs typical for O_DIRECT, + // we have a runtime validation layer inside `VirtualFile::write_at` and `VirtualFile::read_at`. 
+ static WARNING: std::sync::Once = std::sync::Once::new(); + WARNING.call_once(|| { + let span = tracing::info_span!(parent: None, "open_options"); + let _enter = span.enter(); + tracing::warn!("your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs; this warning is logged once per process"); + }); + } + } + + match self.inner.clone() { + Inner::StdFs(mut x) => x + .custom_flags(custom_flags) + .open(path) + .map(|file| file.into()), + #[cfg(target_os = "linux")] + Inner::TokioEpollUring(mut x) => { + x.custom_flags(custom_flags); let system = super::io_engine::tokio_epoll_uring_ext::thread_local_system().await; - system.open(path, x).await.map_err(|e| match e { - tokio_epoll_uring::Error::Op(e) => e, - tokio_epoll_uring::Error::System(system) => { - std::io::Error::new(std::io::ErrorKind::Other, system) - } + let (_, res) = super::io_engine::retry_ecanceled_once((), |()| async { + let res = system.open(path, &x).await; + ((), res) }) + .await; + res.map_err(super::io_engine::epoll_uring_error_to_std) } } } @@ -139,16 +180,8 @@ impl OpenOptions { self } - pub fn custom_flags(mut self, flags: i32) -> Self { - match &mut self.inner { - Inner::StdFs(x) => { - let _ = x.custom_flags(flags); - } - #[cfg(target_os = "linux")] - Inner::TokioEpollUring(x) => { - let _ = x.custom_flags(flags); - } - } + pub fn direct(mut self, direct: bool) -> Self { + self.direct = direct; self } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs index b41a9f6cd2..ac9867e8b4 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -247,6 +247,19 @@ pub enum FlushTaskError { Cancelled, } +impl FlushTaskError { + pub fn is_cancel(&self) -> bool { + match self { + FlushTaskError::Cancelled => true, + } + } + pub fn into_anyhow(self) -> anyhow::Error { + match self 
{ + FlushTaskError::Cancelled => anyhow::anyhow!(self), + } + } +} + impl FlushBackgroundTask where Buf: IoBufAligned + Send + Sync, diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c index 818a149499..9609f186b9 100644 --- a/pgxn/neon/communicator.c +++ b/pgxn/neon/communicator.c @@ -425,15 +425,12 @@ compact_prefetch_buffers(void) * point inside and outside PostgreSQL. * * This still does throw errors when it receives malformed responses from PS. - * - * When we're not called from CHECK_FOR_INTERRUPTS (indicated by - * IsHandlingInterrupts) we also report we've ended prefetch receive work, - * just in case state tracking was lost due to an error in the sync getPage - * response code. */ void -communicator_prefetch_pump_state(bool IsHandlingInterrupts) +communicator_prefetch_pump_state(void) { + START_PREFETCH_RECEIVE_WORK(); + while (MyPState->ring_receive != MyPState->ring_flush) { NeonResponse *response; @@ -482,9 +479,7 @@ communicator_prefetch_pump_state(bool IsHandlingInterrupts) } } - /* We never pump the prefetch state while handling other pages */ - if (!IsHandlingInterrupts) - END_PREFETCH_RECEIVE_WORK(); + END_PREFETCH_RECEIVE_WORK(); communicator_reconfigure_timeout_if_needed(); } @@ -672,9 +667,10 @@ prefetch_wait_for(uint64 ring_index) Assert(MyPState->ring_unused > ring_index); + START_PREFETCH_RECEIVE_WORK(); + while (MyPState->ring_receive <= ring_index) { - START_PREFETCH_RECEIVE_WORK(); entry = GetPrfSlot(MyPState->ring_receive); Assert(entry->status == PRFS_REQUESTED); @@ -683,17 +679,18 @@ prefetch_wait_for(uint64 ring_index) result = false; break; } - - END_PREFETCH_RECEIVE_WORK(); CHECK_FOR_INTERRUPTS(); } + if (result) { /* Check that slot is actually received (srver can be disconnected in prefetch_pump_state called from CHECK_FOR_INTERRUPTS */ PrefetchRequest *slot = GetPrfSlot(ring_index); - return slot->status == PRFS_RECEIVED; + result = slot->status == PRFS_RECEIVED; } - return false; + END_PREFETCH_RECEIVE_WORK(); + + 
return result; ; } @@ -720,6 +717,7 @@ prefetch_read(PrefetchRequest *slot) Assert(slot->status == PRFS_REQUESTED); Assert(slot->response == NULL); Assert(slot->my_ring_index == MyPState->ring_receive); + Assert(readpage_reentrant_guard); if (slot->status != PRFS_REQUESTED || slot->response != NULL || @@ -802,6 +800,7 @@ communicator_prefetch_receive(BufferTag tag) PrfHashEntry *entry; PrefetchRequest hashkey; + Assert(readpage_reentrant_guard); hashkey.buftag = tag; entry = prfh_lookup(MyPState->prf_hash, &hashkey); if (entry != NULL && prefetch_wait_for(entry->slot->my_ring_index)) @@ -821,8 +820,12 @@ communicator_prefetch_receive(BufferTag tag) void prefetch_on_ps_disconnect(void) { + bool save_readpage_reentrant_guard = readpage_reentrant_guard; MyPState->ring_flush = MyPState->ring_unused; + /* Prohibit callig of prefetch_pump_state */ + START_PREFETCH_RECEIVE_WORK(); + while (MyPState->ring_receive < MyPState->ring_unused) { PrefetchRequest *slot; @@ -851,6 +854,9 @@ prefetch_on_ps_disconnect(void) MyNeonCounters->getpage_prefetch_discards_total += 1; } + /* Restore guard */ + readpage_reentrant_guard = save_readpage_reentrant_guard; + /* * We can have gone into retry due to network error, so update stats with * the latest available @@ -2509,7 +2515,7 @@ communicator_processinterrupts(void) if (timeout_signaled) { if (!readpage_reentrant_guard && readahead_getpage_pull_timeout_ms > 0) - communicator_prefetch_pump_state(true); + communicator_prefetch_pump_state(); timeout_signaled = false; communicator_reconfigure_timeout_if_needed(); diff --git a/pgxn/neon/communicator.h b/pgxn/neon/communicator.h index f55c4b10f1..5376c9b839 100644 --- a/pgxn/neon/communicator.h +++ b/pgxn/neon/communicator.h @@ -44,7 +44,7 @@ extern int communicator_read_slru_segment(SlruKind kind, int64 segno, void *buffer); extern void communicator_reconfigure_timeout_if_needed(void); -extern void communicator_prefetch_pump_state(bool IsHandlingInterrupts); +extern void 
communicator_prefetch_pump_state(void); #endif diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index ecc55bb540..176fd9643f 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -936,6 +936,44 @@ lfc_prewarm_main(Datum main_arg) lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp(); } +void +lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks) +{ + BufferTag tag; + FileCacheEntry *entry; + uint32 hash; + + if (lfc_maybe_disabled()) /* fast exit if file cache is disabled */ + return; + + CopyNRelFileInfoToBufTag(tag, rinfo); + tag.forkNum = forkNum; + + CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber); + + LWLockAcquire(lfc_lock, LW_EXCLUSIVE); + if (LFC_ENABLED()) + { + for (BlockNumber blkno = 0; blkno < nblocks; blkno += lfc_blocks_per_chunk) + { + tag.blockNum = blkno; + hash = get_hash_value(lfc_hash, &tag); + entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL); + if (entry != NULL) + { + for (int i = 0; i < lfc_blocks_per_chunk; i++) + { + if (GET_STATE(entry, i) == AVAILABLE) + { + lfc_ctl->used_pages -= 1; + SET_STATE(entry, i, UNAVAILABLE); + } + } + } + } + } + LWLockRelease(lfc_lock); +} /* * Check if page is present in the cache. 
diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h index c7b6b09f72..d5ac55d5ba 100644 --- a/pgxn/neon/file_cache.h +++ b/pgxn/neon/file_cache.h @@ -28,6 +28,7 @@ typedef struct FileCacheState extern bool lfc_store_prefetch_result; /* functions for local file cache */ +extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks); extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, const void *const *buffers, BlockNumber nblocks); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index ee4e6ccc5b..3b6c4247c3 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -433,7 +433,6 @@ pageserver_connect(shardno_t shard_no, int elevel) now = GetCurrentTimestamp(); us_since_last_attempt = (int64) (now - shard->last_reconnect_time); - shard->last_reconnect_time = now; /* * Make sure we don't do exponential backoff with a constant multiplier @@ -447,14 +446,23 @@ pageserver_connect(shardno_t shard_no, int elevel) /* * If we did other tasks between reconnect attempts, then we won't * need to wait as long as a full delay. + * + * This is a loop to protect against interrupted sleeps. 
*/ - if (us_since_last_attempt < shard->delay_us) + while (us_since_last_attempt < shard->delay_us) { pg_usleep(shard->delay_us - us_since_last_attempt); + + /* At least we should handle cancellations here */ + CHECK_FOR_INTERRUPTS(); + + now = GetCurrentTimestamp(); + us_since_last_attempt = (int64) (now - shard->last_reconnect_time); } /* update the delay metric */ shard->delay_us = Min(shard->delay_us * 2, MAX_RECONNECT_INTERVAL_USEC); + shard->last_reconnect_time = now; /* * Connect using the connection string we got from the diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h index b3ed0c04e8..bf91a02b45 100644 --- a/pgxn/neon/neon_pgversioncompat.h +++ b/pgxn/neon/neon_pgversioncompat.h @@ -86,7 +86,7 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #define InvalidRelFileNumber InvalidOid -#define SMgrRelGetRelInfo(reln) \ +#define SMgrRelGetRelInfo(reln) \ (reln->smgr_rnode.node) #define DropRelationAllLocalBuffers DropRelFileNodeAllLocalBuffers @@ -148,6 +148,12 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode, #define DropRelationAllLocalBuffers DropRelationAllLocalBuffers #endif +#define NRelFileInfoInvalidate(rinfo) do { \ + NInfoGetSpcOid(rinfo) = InvalidOid; \ + NInfoGetDbOid(rinfo) = InvalidOid; \ + NInfoGetRelNumber(rinfo) = InvalidRelFileNumber; \ + } while (0) + #if PG_MAJORVERSION_NUM < 17 #define ProcNumber BackendId #define INVALID_PROC_NUMBER InvalidBackendId diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 87eb420717..43fd715bbb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -108,7 +108,7 @@ typedef enum UNLOGGED_BUILD_NOT_PERMANENT } UnloggedBuildPhase; -static SMgrRelation unlogged_build_rel = NULL; +static NRelFileInfo unlogged_build_rel_info; static UnloggedBuildPhase unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; static bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id); @@ -912,16 +912,19 @@ 
neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, { case 0: neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdextend(reln, forkNum, blkno, buffer, skipFsync); + return; + } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdextend(reln, forkNum, blkno, buffer, skipFsync); - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - lfc_write(InfoFromSMgrRel(reln), forkNum, blkno, buffer); return; default: @@ -1003,21 +1006,19 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, { case 0: neon_log(ERROR, "cannot call smgrextend() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); + return; + } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdzeroextend(reln, forkNum, blocknum, nblocks, skipFsync); - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - { - for (int i = 0; i < nblocks; i++) - { - lfc_write(InfoFromSMgrRel(reln), forkNum, blocknum + i, buffer.data); - } - } return; default: @@ -1179,7 +1180,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, blocknum += iterblocks; } - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); return false; } @@ -1218,7 +1219,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) communicator_prefetch_register_bufferv(tag, NULL, 1, NULL); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); return false; } @@ -1262,7 +1263,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ 
neon_log(SmgrTrace, "writeback noop"); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1281,75 +1282,24 @@ neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, communicator_read_at_lsnv(rinfo, forkNum, blkno, &request_lsns, &buffer, 1, NULL); } -#if PG_MAJORVERSION_NUM < 17 -/* - * neon_read() -- Read the specified block from a relation. - */ -#if PG_MAJORVERSION_NUM < 16 -static void -neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) -#else -static void -neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) -#endif -{ - neon_request_lsns request_lsns; - bits8 present; - void *bufferp; - - switch (reln->smgr_relpersistence) - { - case 0: - neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); - - case RELPERSISTENCE_PERMANENT: - break; - - case RELPERSISTENCE_TEMP: - case RELPERSISTENCE_UNLOGGED: - mdread(reln, forkNum, blkno, buffer); - return; - - default: - neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); - } - - /* Try to read PS results if they are available */ - communicator_prefetch_pump_state(false); - - neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); - - present = 0; - bufferp = buffer; - if (communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) - { - /* Prefetch hit */ - return; - } - - /* Try to read from local file cache */ - if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) - { - MyNeonCounters->file_cache_hits_total++; - return; - } - - neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); - - /* - * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
- */ - communicator_prefetch_pump_state(false); - #ifdef DEBUG_COMPARE_LOCAL +static void +compare_with_local(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void* buffer, XLogRecPtr request_lsn) +{ if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { char pageserver_masked[BLCKSZ]; PGIOAlignedBlock mdbuf; PGIOAlignedBlock mdbuf_masked; - XLogRecPtr request_lsn = request_lsns.request_lsn; +#if PG_MAJORVERSION_NUM >= 17 + { + void* mdbuffers[1] = { mdbuf.data }; + mdreadv(reln, forkNum, blkno, mdbuffers, 1); + } +#else mdread(reln, forkNum, blkno, mdbuf.data); +#endif memcpy(pageserver_masked, buffer, BLCKSZ); memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); @@ -1413,11 +1363,111 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer } } } +} +#endif + + +#if PG_MAJORVERSION_NUM < 17 + +/* + * neon_read() -- Read the specified block from a relation. + */ +#if PG_MAJORVERSION_NUM < 16 +static void +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, char *buffer) +#else +static void +neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer) +#endif +{ + neon_request_lsns request_lsns; + bits8 present; + void *bufferp; + + switch (reln->smgr_relpersistence) + { + case 0: + neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + break; + + case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdread(reln, forkNum, blkno, buffer); + return; + } + break; + + case RELPERSISTENCE_TEMP: + case RELPERSISTENCE_UNLOGGED: + mdread(reln, forkNum, blkno, buffer); + return; + + default: + neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); + } + + /* Try to read PS results if they are available */ + communicator_prefetch_pump_state(); + + neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1); + + present = 0; + bufferp = buffer; + if 
(communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, &bufferp, &present)) + { + /* Prefetch hit */ +#ifdef DEBUG_COMPARE_LOCAL + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); +#else + return; +#endif + } + + /* Try to read from local file cache */ + if (lfc_read(InfoFromSMgrRel(reln), forkNum, blkno, buffer)) + { + MyNeonCounters->file_cache_hits_total++; +#ifdef DEBUG_COMPARE_LOCAL + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); +#else + return; +#endif + } + + neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); + + /* + * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. + */ + communicator_prefetch_pump_state(); + +#ifdef DEBUG_COMPARE_LOCAL + compare_with_local(reln, forkNum, blkno, buffer, request_lsns.request_lsn); #endif } #endif /* PG_MAJORVERSION_NUM <= 16 */ #if PG_MAJORVERSION_NUM >= 17 + +#ifdef DEBUG_COMPARE_LOCAL +static void +compare_with_localv(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void** buffers, BlockNumber nblocks, neon_request_lsns* request_lsns, bits8* read_pages) +{ + if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) + { + for (BlockNumber i = 0; i < nblocks; i++) + { + if (BITMAP_ISSET(read_pages, i)) + { + compare_with_local(reln, forkNum, blkno + i, buffers[i], request_lsns[i].request_lsn); + } + } + } +} +#endif + + static void neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks) @@ -1431,8 +1481,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, { case 0: neon_log(ERROR, "cannot call smgrread() on rel with unknown persistence"); + break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdreadv(reln, forknum, blocknum, buffers, nblocks); + return; + } break; case 
RELPERSISTENCE_TEMP: @@ -1449,7 +1505,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, nblocks, PG_IOV_MAX); /* Try to read PS results if they are available */ - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks); @@ -1460,8 +1516,13 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, blocknum, request_lsns, nblocks, buffers, read_pages); +#ifdef DEBUG_COMPARE_LOCAL + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); + memset(read_pages, 0, sizeof(read_pages)); +#else if (prefetch_result == nblocks) return; +#endif /* Try to read from local file cache */ lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers, @@ -1470,9 +1531,14 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if (lfc_result > 0) MyNeonCounters->file_cache_hits_total += lfc_result; +#ifdef DEBUG_COMPARE_LOCAL + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); + memset(read_pages, 0, sizeof(read_pages)); +#else /* Read all blocks from LFC, so we're done */ if (prefetch_result + lfc_result == nblocks) return; +#endif communicator_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, buffers, nblocks, read_pages); @@ -1480,94 +1546,11 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, /* * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes. 
*/ - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL - if (forknum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) - { - char pageserver_masked[BLCKSZ]; - PGIOAlignedBlock mdbuf; - PGIOAlignedBlock mdbuf_masked; - XLogRecPtr request_lsn = request_lsns->request_lsn; - - for (int i = 0; i < nblocks; i++) - { - BlockNumber blkno = blocknum + i; - if (!BITMAP_ISSET(read_pages, i)) - continue; - -#if PG_MAJORVERSION_NUM >= 17 - { - void* mdbuffers[1] = { mdbuf.data }; - mdreadv(reln, forknum, blkno, mdbuffers, 1); - } -#else - mdread(reln, forknum, blkno, mdbuf.data); -#endif - - memcpy(pageserver_masked, buffers[i], BLCKSZ); - memcpy(mdbuf_masked.data, mdbuf.data, BLCKSZ); - - if (PageIsNew((Page) mdbuf.data)) - { - if (!PageIsNew((Page) pageserver_masked)) - { - neon_log(PANIC, "page is new in MD but not in Page Server at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(buffers[i])); - } - } - else if (PageIsNew((Page) buffers[i])) - { - neon_log(PANIC, "page is new in Page Server but not in MD at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf.data)); - } - else if (PageGetSpecialSize(mdbuf.data) == 0) - { - /* assume heap */ - RmgrTable[RM_HEAP_ID].rm_mask(mdbuf_masked.data, blkno); - RmgrTable[RM_HEAP_ID].rm_mask(pageserver_masked, blkno); - - if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) - { - neon_log(PANIC, "heap buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked.data), - 
hexdump_page(pageserver_masked)); - } - } - else if (PageGetSpecialSize(mdbuf.data) == MAXALIGN(sizeof(BTPageOpaqueData))) - { - if (((BTPageOpaqueData *) PageGetSpecialPointer(mdbuf.data))->btpo_cycleid < MAX_BT_CYCLE_ID) - { - /* assume btree */ - RmgrTable[RM_BTREE_ID].rm_mask(mdbuf_masked.data, blkno); - RmgrTable[RM_BTREE_ID].rm_mask(pageserver_masked, blkno); - - if (memcmp(mdbuf_masked.data, pageserver_masked, BLCKSZ) != 0) - { - neon_log(PANIC, "btree buffers differ at blk %u in rel %u/%u/%u fork %u (request LSN %X/%08X):\n------ MD ------\n%s\n------ Page Server ------\n%s\n", - blkno, - RelFileInfoFmt(InfoFromSMgrRel(reln)), - forknum, - (uint32) (request_lsn >> 32), (uint32) request_lsn, - hexdump_page(mdbuf_masked.data), - hexdump_page(pageserver_masked)); - } - } - } - } - } + memset(read_pages, 0xFF, sizeof(read_pages)); + compare_with_localv(reln, forknum, blocknum, buffers, nblocks, request_lsns, read_pages); #endif } #endif @@ -1638,6 +1621,15 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { +#if PG_MAJORVERSION_NUM >= 17 + mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); +#else + mdwrite(reln, forknum, blocknum, buffer, skipFsync); +#endif + return; + } break; case RELPERSISTENCE_TEMP: @@ -1647,9 +1639,6 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo #else mdwrite(reln, forknum, blocknum, buffer, skipFsync); #endif - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); @@ -1665,7 +1654,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, 
buffer); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1710,14 +1699,16 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); + return; + } break; case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: mdwritev(reln, forknum, blkno, buffers, nblocks, skipFsync); - /* Update LFC in case of unlogged index build */ - if (reln == unlogged_build_rel && unlogged_build_phase == UNLOGGED_BUILD_PHASE_2) - lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); return; default: neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence); @@ -1727,7 +1718,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, buffers, nblocks); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1753,6 +1744,10 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum) break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + return mdnblocks(reln, forknum); + } break; case RELPERSISTENCE_TEMP: @@ -1822,6 +1817,11 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo break; case RELPERSISTENCE_PERMANENT: + if (RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))) + { + mdtruncate(reln, forknum, old_blocks, nblocks); + return; + } break; case RELPERSISTENCE_TEMP: @@ -1902,7 +1902,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); - communicator_prefetch_pump_state(false); + communicator_prefetch_pump_state(); #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) @@ -1960,7 +1960,6 
@@ neon_start_unlogged_build(SMgrRelation reln) */ if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) neon_log(ERROR, "unlogged relation build is already in progress"); - Assert(unlogged_build_rel == NULL); ereport(SmgrTrace, (errmsg(NEON_TAG "starting unlogged build of relation %u/%u/%u", @@ -1977,7 +1976,7 @@ neon_start_unlogged_build(SMgrRelation reln) case RELPERSISTENCE_TEMP: case RELPERSISTENCE_UNLOGGED: - unlogged_build_rel = reln; + unlogged_build_rel_info = InfoFromSMgrRel(reln); unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT; #ifdef DEBUG_COMPARE_LOCAL if (!IsParallelWorker()) @@ -1998,12 +1997,9 @@ neon_start_unlogged_build(SMgrRelation reln) neon_log(ERROR, "cannot perform unlogged index build, index is not empty "); #endif - unlogged_build_rel = reln; + unlogged_build_rel_info = InfoFromSMgrRel(reln); unlogged_build_phase = UNLOGGED_BUILD_PHASE_1; - /* Make the relation look like it's unlogged */ - reln->smgr_relpersistence = RELPERSISTENCE_UNLOGGED; - /* * Create the local file. In a parallel build, the leader is expected to * call this first and do it. 
@@ -2030,17 +2026,16 @@ neon_start_unlogged_build(SMgrRelation reln) static void neon_finish_unlogged_build_phase_1(SMgrRelation reln) { - Assert(unlogged_build_rel == reln); + Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))); ereport(SmgrTrace, (errmsg(NEON_TAG "finishing phase 1 of unlogged build of relation %u/%u/%u", - RelFileInfoFmt(InfoFromSMgrRel(reln))))); + RelFileInfoFmt((unlogged_build_rel_info))))); if (unlogged_build_phase == UNLOGGED_BUILD_NOT_PERMANENT) return; Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_1); - Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); /* * In a parallel build, (only) the leader process performs the 2nd @@ -2048,7 +2043,7 @@ neon_finish_unlogged_build_phase_1(SMgrRelation reln) */ if (IsParallelWorker()) { - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } else @@ -2069,11 +2064,11 @@ neon_end_unlogged_build(SMgrRelation reln) { NRelFileInfoBackend rinfob = InfoBFromSMgrRel(reln); - Assert(unlogged_build_rel == reln); + Assert(RelFileInfoEquals(unlogged_build_rel_info, InfoFromSMgrRel(reln))); ereport(SmgrTrace, (errmsg(NEON_TAG "ending unlogged build of relation %u/%u/%u", - RelFileInfoFmt(InfoFromNInfoB(rinfob))))); + RelFileInfoFmt(unlogged_build_rel_info)))); if (unlogged_build_phase != UNLOGGED_BUILD_NOT_PERMANENT) { @@ -2081,7 +2076,6 @@ neon_end_unlogged_build(SMgrRelation reln) BlockNumber nblocks; Assert(unlogged_build_phase == UNLOGGED_BUILD_PHASE_2); - Assert(reln->smgr_relpersistence == RELPERSISTENCE_UNLOGGED); /* * Update the last-written LSN cache. 
@@ -2102,9 +2096,6 @@ neon_end_unlogged_build(SMgrRelation reln) InfoFromNInfoB(rinfob), MAIN_FORKNUM); - /* Make the relation look permanent again */ - reln->smgr_relpersistence = RELPERSISTENCE_PERMANENT; - /* Remove local copy */ for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) { @@ -2113,6 +2104,8 @@ neon_end_unlogged_build(SMgrRelation reln) forknum); forget_cached_relsize(InfoFromNInfoB(rinfob), forknum); + lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks); + mdclose(reln, forknum); #ifndef DEBUG_COMPARE_LOCAL /* use isRedo == true, so that we drop it immediately */ @@ -2123,7 +2116,7 @@ neon_end_unlogged_build(SMgrRelation reln) mdunlink(rinfob, INIT_FORKNUM, true); #endif } - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; } @@ -2196,7 +2189,7 @@ AtEOXact_neon(XactEvent event, void *arg) * Forget about any build we might have had in progress. The local * file will be unlinked by smgrDoPendingDeletes() */ - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; break; @@ -2208,7 +2201,7 @@ AtEOXact_neon(XactEvent event, void *arg) case XACT_EVENT_PRE_PREPARE: if (unlogged_build_phase != UNLOGGED_BUILD_NOT_IN_PROGRESS) { - unlogged_build_rel = NULL; + NRelFileInfoInvalidate(unlogged_build_rel_info); unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS; ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), diff --git a/poetry.lock b/poetry.lock index 1a772d3415..e6440761be 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -1145,18 +1145,19 @@ dotenv = ["python-dotenv"] [[package]] name = "flask-cors" -version = "5.0.0" -description = "A Flask extension adding a decorator for CORS support" +version = "6.0.0" +description = "A Flask extension simplifying CORS support" optional = false -python-versions = "*" +python-versions = "<4.0,>=3.9" groups = ["main"] files = [ - {file = "Flask_Cors-5.0.0-py2.py3-none-any.whl", hash = "sha256:b9e307d082a9261c100d8fb0ba909eec6a228ed1b60a8315fd85f783d61910bc"}, - {file = "flask_cors-5.0.0.tar.gz", hash = "sha256:5aadb4b950c4e93745034594d9f3ea6591f734bb3662e16e255ffbf5e89c88ef"}, + {file = "flask_cors-6.0.0-py3-none-any.whl", hash = "sha256:6332073356452343a8ccddbfec7befdc3fdd040141fe776ec9b94c262f058657"}, + {file = "flask_cors-6.0.0.tar.gz", hash = "sha256:4592c1570246bf7beee96b74bc0adbbfcb1b0318f6ba05c412e8909eceec3393"}, ] [package.dependencies] -Flask = ">=0.9" +flask = ">=0.9" +Werkzeug = ">=0.7" [[package]] name = "frozenlist" diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 183976374a..526d0df7f2 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -12,9 +12,9 @@ use tracing::{debug, warn}; use crate::auth::password_hack::parse_endpoint_param; use crate::context::RequestContext; use crate::error::{ReportableError, UserFacingError}; -use crate::metrics::{Metrics, SniKind}; +use crate::metrics::{Metrics, SniGroup, SniKind}; use crate::proxy::NeonOptions; -use crate::serverless::SERVERLESS_DRIVER_SNI; +use crate::serverless::{AUTH_BROKER_SNI, SERVERLESS_DRIVER_SNI}; use crate::types::{EndpointId, RoleName}; #[derive(Debug, Error, PartialEq, Eq, Clone)] @@ -65,7 +65,7 @@ pub(crate) fn endpoint_sni(sni: &str, common_names: &HashSet) -> Option< if !common_names.contains(common_name) { return None; } - if subdomain == SERVERLESS_DRIVER_SNI { + if subdomain == SERVERLESS_DRIVER_SNI || subdomain == AUTH_BROKER_SNI { return None; } 
Some(EndpointId::from(subdomain)) @@ -128,22 +128,23 @@ impl ComputeUserInfoMaybeEndpoint { let metrics = Metrics::get(); debug!(%user, "credentials"); - if sni.is_some() { + + let protocol = ctx.protocol(); + let kind = if sni.is_some() { debug!("Connection with sni"); - metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni); + SniKind::Sni } else if endpoint.is_some() { - metrics - .proxy - .accepted_connections_by_sni - .inc(SniKind::NoSni); debug!("Connection without sni"); + SniKind::NoSni } else { - metrics - .proxy - .accepted_connections_by_sni - .inc(SniKind::PasswordHack); debug!("Connection with password hack"); - } + SniKind::PasswordHack + }; + + metrics + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); let options = NeonOptions::parse_params(params); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 7d4b44841d..d60d32eb3b 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,6 +1,10 @@ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +#[allow(non_upper_case_globals)] +#[unsafe(export_name = "malloc_conf")] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; + #[tokio::main] async fn main() -> anyhow::Result<()> { proxy::binary::proxy::run().await diff --git a/proxy/src/binary/local_proxy.rs b/proxy/src/binary/local_proxy.rs index ee7f6ffcd7..a566383390 100644 --- a/proxy/src/binary/local_proxy.rs +++ b/proxy/src/binary/local_proxy.rs @@ -423,8 +423,8 @@ async fn refresh_config_inner( if let Some(tls_config) = data.tls { let tls_config = tokio::task::spawn_blocking(move || { crate::tls::server_config::configure_tls( - &tls_config.key_path, - &tls_config.cert_path, + tls_config.key_path.as_ref(), + tls_config.cert_path.as_ref(), None, false, ) diff --git a/proxy/src/binary/pg_sni_router.rs b/proxy/src/binary/pg_sni_router.rs index 19be058ac3..3e87538ae7 100644 --- a/proxy/src/binary/pg_sni_router.rs +++ 
b/proxy/src/binary/pg_sni_router.rs @@ -1,8 +1,10 @@ -/// A stand-alone program that routes connections, e.g. from -/// `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. -/// -/// This allows connecting to pods/services running in the same Kubernetes cluster from -/// the outside. Similar to an ingress controller for HTTPS. +//! A stand-alone program that routes connections, e.g. from +//! `aaa--bbb--1234.external.domain` to `aaa.bbb.internal.domain:1234`. +//! +//! This allows connecting to pods/services running in the same Kubernetes cluster from +//! the outside. Similar to an ingress controller for HTTPS. + +use std::path::Path; use std::{net::SocketAddr, sync::Arc}; use anyhow::{Context, anyhow, bail, ensure}; @@ -86,46 +88,7 @@ pub async fn run() -> anyhow::Result<()> { args.get_one::("tls-key"), args.get_one::("tls-cert"), ) { - (Some(key_path), Some(cert_path)) => { - let key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - - let mut keys = - rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); - - ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); - PrivateKeyDer::Pkcs8( - keys.pop() - .expect("keys should not be empty") - .context(format!("Failed to read TLS keys at '{key_path}'"))?, - ) - }; - - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; - - let cert_chain: Vec<_> = { - rustls_pemfile::certs(&mut &cert_chain_bytes[..]) - .try_collect() - .with_context(|| { - format!("Failed to read TLS certificate chain from bytes from file at '{cert_path}'.") - })? 
- }; - - // needed for channel bindings - let first_cert = cert_chain.first().context("missing certificate")?; - let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - - let tls_config = - rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("ring should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? - .into(); - - (tls_config, tls_server_end_point) - } + (Some(key_path), Some(cert_path)) => parse_tls(key_path.as_ref(), cert_path.as_ref())?, _ => bail!("tls-key and tls-cert must be specified"), }; @@ -188,7 +151,58 @@ pub async fn run() -> anyhow::Result<()> { match signal {} } -async fn task_main( +pub(super) fn parse_tls( + key_path: &Path, + cert_path: &Path, +) -> anyhow::Result<(Arc, TlsServerEndPoint)> { + let key = { + let key_bytes = std::fs::read(key_path).context("TLS key file")?; + + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]).collect_vec(); + + ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); + PrivateKeyDer::Pkcs8( + keys.pop() + .expect("keys should not be empty") + .context(format!( + "Failed to read TLS keys at '{}'", + key_path.display() + ))?, + ) + }; + + let cert_chain_bytes = std::fs::read(cert_path).context(format!( + "Failed to read TLS cert file at '{}.'", + cert_path.display() + ))?; + + let cert_chain: Vec<_> = { + rustls_pemfile::certs(&mut &cert_chain_bytes[..]) + .try_collect() + .with_context(|| { + format!( + "Failed to read TLS certificate chain from bytes from file at '{}'.", + cert_path.display() + ) + })? 
+ }; + + // needed for channel bindings + let first_cert = cert_chain.first().context("missing certificate")?; + let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; + + let tls_config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); + + Ok((tls_config, tls_server_end_point)) +} + +pub(super) async fn task_main( dest_suffix: Arc, tls_config: Arc, compute_tls_config: Option>, @@ -380,6 +394,7 @@ async fn handle_client( } } +#[allow(clippy::large_enum_variant)] enum Connection { Raw(tokio::net::TcpStream), Tls(tokio_rustls::client::TlsStream), diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs index e03f2f33d9..f40d5041c1 100644 --- a/proxy/src/binary/proxy.rs +++ b/proxy/src/binary/proxy.rs @@ -1,9 +1,10 @@ use std::net::SocketAddr; +use std::path::PathBuf; use std::pin::pin; use std::sync::Arc; use std::time::Duration; -use anyhow::bail; +use anyhow::{bail, ensure}; use arc_swap::ArcSwapOption; use futures::future::Either; use remote_storage::RemoteStorageConfig; @@ -42,11 +43,12 @@ project_build_tag!(BUILD_TAG); use clap::{Parser, ValueEnum}; #[derive(Clone, Debug, ValueEnum)] +#[clap(rename_all = "kebab-case")] enum AuthBackendType { - #[value(name("cplane-v1"), alias("control-plane"))] - ControlPlaneV1, + #[clap(alias("cplane-v1"))] + ControlPlane, - #[value(name("link"), alias("control-redirect"))] + #[clap(alias("link"))] ConsoleRedirect, #[cfg(any(test, feature = "testing"))] @@ -62,18 +64,18 @@ struct ProxyCliArgs { region: String, /// listen for incoming client connections on ip:port #[clap(short, long, default_value = "127.0.0.1:4432")] - proxy: String, + proxy: SocketAddr, #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] auth_backend: AuthBackendType, /// listen 
for management callback connection on ip:port #[clap(short, long, default_value = "127.0.0.1:7000")] - mgmt: String, + mgmt: SocketAddr, /// listen for incoming http connections (metrics, etc) on ip:port #[clap(long, default_value = "127.0.0.1:7001")] - http: String, + http: SocketAddr, /// listen for incoming wss connections on ip:port #[clap(long)] - wss: Option, + wss: Option, /// redirect unauthenticated users to the given uri in case of console redirect auth #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] uri: String, @@ -99,18 +101,18 @@ struct ProxyCliArgs { /// /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'k', long, alias = "ssl-key")] - tls_key: Option, + tls_key: Option, /// path to TLS cert for client postgres connections /// /// tls-key and tls-cert are for backwards compatibility, we can put all certs in one dir #[clap(short = 'c', long, alias = "ssl-cert")] - tls_cert: Option, + tls_cert: Option, /// Allow writing TLS session keys to the given file pointed to by the environment variable `SSLKEYLOGFILE`. 
#[clap(long, alias = "allow-ssl-keylogfile")] allow_tls_keylogfile: bool, /// path to directory with TLS certificates for client postgres connections #[clap(long)] - certs_dir: Option, + certs_dir: Option, /// timeout for the TLS handshake #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)] handshake_timeout: tokio::time::Duration, @@ -159,8 +161,11 @@ struct ProxyCliArgs { #[clap(long, default_values_t = RateBucketInfo::DEFAULT_REDIS_SET)] redis_rps_limit: Vec, /// Cancellation channel size (max queue size for redis kv client) - #[clap(long, default_value = "1024")] + #[clap(long, default_value_t = 1024)] cancellation_ch_size: usize, + /// Cancellation ops batch size for redis + #[clap(long, default_value_t = 8)] + cancellation_batch_size: usize, /// cache for `allowed_ips` (use `size=0` to disable) #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)] allowed_ips_cache: String, @@ -229,6 +234,9 @@ struct ProxyCliArgs { // TODO: rename to `console_redirect_confirmation_timeout`. 
#[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] webauth_confirmation_timeout: std::time::Duration, + + #[clap(flatten)] + pg_sni_router: PgSniRouterArgs, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -277,6 +285,25 @@ struct SqlOverHttpArgs { sql_over_http_max_response_size_bytes: usize, } +#[derive(clap::Args, Clone, Debug)] +struct PgSniRouterArgs { + /// listen for incoming client connections on ip:port + #[clap(id = "sni-router-listen", long, default_value = "127.0.0.1:4432")] + listen: SocketAddr, + /// listen for incoming client connections on ip:port, requiring TLS to compute + #[clap(id = "sni-router-listen-tls", long, default_value = "127.0.0.1:4433")] + listen_tls: SocketAddr, + /// path to TLS key for client postgres connections + #[clap(id = "sni-router-tls-key", long)] + tls_key: Option, + /// path to TLS cert for client postgres connections + #[clap(id = "sni-router-tls-cert", long)] + tls_cert: Option, + /// append this domain zone to the SNI hostname to get the destination address + #[clap(id = "sni-router-destination", long)] + dest: Option, +} + pub async fn run() -> anyhow::Result<()> { let _logging_guard = crate::logging::init().await?; let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); @@ -307,73 +334,51 @@ pub async fn run() -> anyhow::Result<()> { Either::Right(auth_backend) => info!("Authentication backend: {auth_backend:?}"), } info!("Using region: {}", args.aws_region); - - // TODO: untangle the config args - let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { - ("plain", redis_url) => match redis_url { - None => { - bail!("plain auth requires redis_notifications to be set"); - } - Some(url) => { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone())) - } - }, - ("irsa", _) => match (&args.redis_host, args.redis_port) { - (Some(host), Some(port)) => Some( - 
ConnectionWithCredentialsProvider::new_with_credentials_provider( - host.to_string(), - port, - elasticache::CredentialsProvider::new( - args.aws_region, - args.redis_cluster_name, - args.redis_user_id, - ) - .await, - ), - ), - (None, None) => { - warn!( - "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client" - ); - None - } - _ => { - bail!("redis-host and redis-port must be specified together"); - } - }, - _ => { - bail!("unknown auth type given"); - } - }; - - let redis_notifications_client = if let Some(url) = args.redis_notifications { - Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url)) - } else { - regional_redis_client.clone() - }; + let (regional_redis_client, redis_notifications_client) = configure_redis(&args).await?; // Check that we can bind to address before further initialization - let http_address: SocketAddr = args.http.parse()?; - info!("Starting http on {http_address}"); - let http_listener = TcpListener::bind(http_address).await?.into_std()?; + info!("Starting http on {}", args.http); + let http_listener = TcpListener::bind(args.http).await?.into_std()?; - let mgmt_address: SocketAddr = args.mgmt.parse()?; - info!("Starting mgmt on {mgmt_address}"); - let mgmt_listener = TcpListener::bind(mgmt_address).await?; + info!("Starting mgmt on {}", args.mgmt); + let mgmt_listener = TcpListener::bind(args.mgmt).await?; let proxy_listener = if args.is_auth_broker { None } else { - let proxy_address: SocketAddr = args.proxy.parse()?; - info!("Starting proxy on {proxy_address}"); + info!("Starting proxy on {}", args.proxy); + Some(TcpListener::bind(args.proxy).await?) + }; - Some(TcpListener::bind(proxy_address).await?) 
+ let sni_router_listeners = { + let args = &args.pg_sni_router; + if args.dest.is_some() { + ensure!( + args.tls_key.is_some(), + "sni-router-tls-key must be provided" + ); + ensure!( + args.tls_cert.is_some(), + "sni-router-tls-cert must be provided" + ); + + info!( + "Starting pg-sni-router on {} and {}", + args.listen, args.listen_tls + ); + + Some(( + TcpListener::bind(args.listen).await?, + TcpListener::bind(args.listen_tls).await?, + )) + } else { + None + } }; // TODO: rename the argument to something like serverless. // It now covers more than just websockets, it also covers SQL over HTTP. let serverless_listener = if let Some(serverless_address) = args.wss { - let serverless_address: SocketAddr = serverless_address.parse()?; info!("Starting wss on {serverless_address}"); Some(TcpListener::bind(serverless_address).await?) } else if args.is_auth_broker { @@ -458,6 +463,37 @@ pub async fn run() -> anyhow::Result<()> { } } + // spawn pg-sni-router mode. + if let Some((listen, listen_tls)) = sni_router_listeners { + let args = args.pg_sni_router; + let dest = args.dest.expect("already asserted it is set"); + let key_path = args.tls_key.expect("already asserted it is set"); + let cert_path = args.tls_cert.expect("already asserted it is set"); + + let (tls_config, tls_server_end_point) = + super::pg_sni_router::parse_tls(&key_path, &cert_path)?; + + let dest = Arc::new(dest); + + client_tasks.spawn(super::pg_sni_router::task_main( + dest.clone(), + tls_config.clone(), + None, + tls_server_end_point, + listen, + cancellation_token.clone(), + )); + + client_tasks.spawn(super::pg_sni_router::task_main( + dest, + tls_config, + Some(config.connect_to_compute.tls.clone()), + tls_server_end_point, + listen_tls, + cancellation_token.clone(), + )); + } + client_tasks.spawn(crate::context::parquet::worker( cancellation_token.clone(), args.parquet_upload, @@ -509,7 +545,12 @@ pub async fn run() -> anyhow::Result<()> { if let Some(mut redis_kv_client) = redis_kv_client { 
maintenance_tasks.spawn(async move { redis_kv_client.try_connect().await?; - handle_cancel_messages(&mut redis_kv_client, rx_cancel).await?; + handle_cancel_messages( + &mut redis_kv_client, + rx_cancel, + args.cancellation_batch_size, + ) + .await?; drop(redis_kv_client); @@ -565,7 +606,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { (Some(key_path), Some(cert_path)) => Some(config::configure_tls( key_path, cert_path, - args.certs_dir.as_ref(), + args.certs_dir.as_deref(), args.allow_tls_keylogfile, )?), (None, None) => None, @@ -675,7 +716,7 @@ fn build_auth_backend( args: &ProxyCliArgs, ) -> anyhow::Result, &'static ConsoleRedirectBackend>> { match &args.auth_backend { - AuthBackendType::ControlPlaneV1 => { + AuthBackendType::ControlPlane => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; @@ -811,6 +852,60 @@ fn build_auth_backend( } } +async fn configure_redis( + args: &ProxyCliArgs, +) -> anyhow::Result<( + Option, + Option, +)> { + // TODO: untangle the config args + let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { + ("plain", redis_url) => match redis_url { + None => { + bail!("plain auth requires redis_notifications to be set"); + } + Some(url) => { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone())) + } + }, + ("irsa", _) => match (&args.redis_host, args.redis_port) { + (Some(host), Some(port)) => Some( + ConnectionWithCredentialsProvider::new_with_credentials_provider( + host.clone(), + port, + elasticache::CredentialsProvider::new( + args.aws_region.clone(), + args.redis_cluster_name.clone(), + args.redis_user_id.clone(), + ) + .await, + ), + ), + (None, None) => { + // todo: upgrade to error? 
+ warn!( + "irsa auth requires redis-host and redis-port to be set, continuing without regional_redis_client" + ); + None + } + _ => { + bail!("redis-host and redis-port must be specified together"); + } + }, + _ => { + bail!("unknown auth type given"); + } + }; + + let redis_notifications_client = if let Some(url) = &args.redis_notifications { + Some(ConnectionWithCredentialsProvider::new_with_static_credentials(&**url)) + } else { + regional_redis_client.clone() + }; + + Ok((regional_redis_client, redis_notifications_client)) +} + #[cfg(test)] mod tests { use std::time::Duration; diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index c5ba04eb8c..a6e7bf85a0 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -6,12 +6,12 @@ use ipnet::{IpNet, Ipv4Net, Ipv6Net}; use postgres_client::CancelToken; use postgres_client::tls::MakeTlsConnect; use pq_proto::CancelKeyData; -use redis::{FromRedisValue, Pipeline, Value, pipe}; +use redis::{Cmd, FromRedisValue, Value}; use serde::{Deserialize, Serialize}; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot}; -use tracing::{debug, info, warn}; +use tracing::{debug, error, info, warn}; use crate::auth::backend::ComputeUserInfo; use crate::auth::{AuthError, check_peer_addr_is_in_list}; @@ -30,8 +30,6 @@ use crate::tls::postgres_rustls::MakeRustlsConnect; type IpSubnetKey = IpNet; const CANCEL_KEY_TTL: i64 = 1_209_600; // 2 weeks cancellation key expire time -const REDIS_SEND_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(10); -const BATCH_SIZE: usize = 8; // Message types for sending through mpsc channel pub enum CancelKeyOp { @@ -56,8 +54,70 @@ pub enum CancelKeyOp { }, } +pub struct Pipeline { + inner: redis::Pipeline, + replies: Vec, +} + +impl Pipeline { + fn with_capacity(n: usize) -> Self { + Self { + inner: redis::Pipeline::with_capacity(n), + replies: Vec::with_capacity(n), + } + } + + async fn execute(&mut self, client: &mut 
RedisKVClient) { + let responses = self.replies.len(); + let batch_size = self.inner.len(); + + match client.query(&self.inner).await { + // for each reply, we expect that many values. + Ok(Value::Array(values)) if values.len() == responses => { + debug!( + batch_size, + responses, "successfully completed cancellation jobs", + ); + for (value, reply) in std::iter::zip(values, self.replies.drain(..)) { + reply.send_value(value); + } + } + Ok(value) => { + error!(batch_size, ?value, "unexpected redis return value"); + for reply in self.replies.drain(..) { + reply.send_err(anyhow!("incorrect response type from redis")); + } + } + Err(err) => { + for reply in self.replies.drain(..) { + reply.send_err(anyhow!("could not send cmd to redis: {err}")); + } + } + } + + self.inner.clear(); + self.replies.clear(); + } + + fn add_command_with_reply(&mut self, cmd: Cmd, reply: CancelReplyOp) { + self.inner.add_command(cmd); + self.replies.push(reply); + } + + fn add_command_no_reply(&mut self, cmd: Cmd) { + self.inner.add_command(cmd).ignore(); + } + + fn add_command(&mut self, cmd: Cmd, reply: Option) { + match reply { + Some(reply) => self.add_command_with_reply(cmd, reply), + None => self.add_command_no_reply(cmd), + } + } +} + impl CancelKeyOp { - fn register(self, pipe: &mut Pipeline) -> Option { + fn register(self, pipe: &mut Pipeline) { #[allow(clippy::used_underscore_binding)] match self { CancelKeyOp::StoreCancelKey { @@ -68,18 +128,18 @@ impl CancelKeyOp { _guard, expire, } => { - pipe.hset(&key, field, value); - pipe.expire(key, expire); - let resp_tx = resp_tx?; - Some(CancelReplyOp::StoreCancelKey { resp_tx, _guard }) + let reply = + resp_tx.map(|resp_tx| CancelReplyOp::StoreCancelKey { resp_tx, _guard }); + pipe.add_command(Cmd::hset(&key, field, value), reply); + pipe.add_command_no_reply(Cmd::expire(key, expire)); } CancelKeyOp::GetCancelData { key, resp_tx, _guard, } => { - pipe.hgetall(key); - Some(CancelReplyOp::GetCancelData { resp_tx, _guard }) + let reply = 
CancelReplyOp::GetCancelData { resp_tx, _guard }; + pipe.add_command_with_reply(Cmd::hgetall(key), reply); } CancelKeyOp::RemoveCancelKey { key, @@ -87,9 +147,9 @@ impl CancelKeyOp { resp_tx, _guard, } => { - pipe.hdel(key, field); - let resp_tx = resp_tx?; - Some(CancelReplyOp::RemoveCancelKey { resp_tx, _guard }) + let reply = + resp_tx.map(|resp_tx| CancelReplyOp::RemoveCancelKey { resp_tx, _guard }); + pipe.add_command(Cmd::hdel(key, field), reply); } } } @@ -169,12 +229,13 @@ impl CancelReplyOp { pub async fn handle_cancel_messages( client: &mut RedisKVClient, mut rx: mpsc::Receiver, + batch_size: usize, ) -> anyhow::Result<()> { - let mut batch = Vec::new(); - let mut replies = vec![]; + let mut batch = Vec::with_capacity(batch_size); + let mut pipeline = Pipeline::with_capacity(batch_size); loop { - if rx.recv_many(&mut batch, BATCH_SIZE).await == 0 { + if rx.recv_many(&mut batch, batch_size).await == 0 { warn!("shutting down cancellation queue"); break Ok(()); } @@ -182,42 +243,11 @@ pub async fn handle_cancel_messages( let batch_size = batch.len(); debug!(batch_size, "running cancellation jobs"); - let mut pipe = pipe(); for msg in batch.drain(..) { - if let Some(reply) = msg.register(&mut pipe) { - replies.push(reply); - } else { - pipe.ignore(); - } + msg.register(&mut pipeline); } - let responses = replies.len(); - - match client.query(pipe).await { - // for each reply, we expect that many values. - Ok(Value::Array(values)) if values.len() == responses => { - debug!( - batch_size, - responses, "successfully completed cancellation jobs", - ); - for (value, reply) in std::iter::zip(values, replies.drain(..)) { - reply.send_value(value); - } - } - Ok(value) => { - debug!(?value, "unexpected redis return value"); - for reply in replies.drain(..) { - reply.send_err(anyhow!("incorrect response type from redis")); - } - } - Err(err) => { - for reply in replies.drain(..) 
{ - reply.send_err(anyhow!("could not send cmd to redis: {err}")); - } - } - } - - replies.clear(); + pipeline.execute(client).await; } } @@ -336,8 +366,7 @@ impl CancellationHandler { return Err(CancelError::InternalError); }; - tx.send_timeout(op, REDIS_SEND_TIMEOUT) - .await + tx.try_send(op) .map_err(|e| { tracing::warn!("failed to send GetCancelData for {key}: {e}"); }) @@ -539,7 +568,7 @@ impl Session { } // Send the store key op to the cancellation handler and set TTL for the key - pub(crate) async fn write_cancel_key( + pub(crate) fn write_cancel_key( &self, cancel_closure: CancelClosure, ) -> Result<(), CancelError> { @@ -565,14 +594,14 @@ impl Session { expire: CANCEL_KEY_TTL, }; - let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let _ = tx.try_send(op).map_err(|e| { let key = self.key; tracing::warn!("failed to send StoreCancelKey for {key}: {e}"); }); Ok(()) } - pub(crate) async fn remove_cancel_key(&self) -> Result<(), CancelError> { + pub(crate) fn remove_cancel_key(&self) -> Result<(), CancelError> { let Some(tx) = &self.cancellation_handler.tx else { tracing::warn!("cancellation handler is not available"); return Err(CancelError::InternalError); @@ -588,7 +617,7 @@ impl Session { .guard(RedisMsgKind::HDel), }; - let _ = tx.send_timeout(op, REDIS_SEND_TIMEOUT).await.map_err(|e| { + let _ = tx.try_send(op).map_err(|e| { let key = self.key; tracing::warn!("failed to send RemoveCancelKey for {key}: {e}"); }); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 0f2c3def0d..e3184e20d1 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -244,9 +244,7 @@ pub(crate) async fn handle_client( let cancellation_handler_clone = Arc::clone(&cancellation_handler); let session = cancellation_handler_clone.get_key(); - session - .write_cancel_key(node.cancel_closure.clone()) - .await?; + session.write_cancel_key(node.cancel_closure.clone())?; 
prepare_client_connection(&node, *session.key(), &mut stream).await?; diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 5f649d2b21..79aaf22990 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -78,7 +78,7 @@ struct RequestContextInner { #[derive(Clone, Debug)] pub(crate) enum AuthMethod { - // aka passwordless, fka link + // aka link ConsoleRedirect, ScramSha256, ScramSha256Plus, diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index 5278fe2a3e..b0b5a598d1 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -3,7 +3,7 @@ use std::net::TcpListener; use std::sync::{Arc, Mutex}; use anyhow::{anyhow, bail}; -use http_utils::endpoint::{self, request_span}; +use http_utils::endpoint::{self, profile_cpu_handler, profile_heap_handler, request_span}; use http_utils::error::ApiError; use http_utils::json::json_response; use http_utils::{RouterBuilder, RouterService}; @@ -33,6 +33,12 @@ fn make_router(metrics: AppMetrics) -> RouterBuilder { request_span(r, move |b| prometheus_metrics_handler(b, state)) }) .get("/v1/status", status_handler) + .get("/profile/cpu", move |r| { + request_span(r, profile_cpu_handler) + }) + .get("/profile/heap", move |r| { + request_span(r, profile_heap_handler) + }) } pub async fn task_main( diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index e5fc0b724b..4b22c912eb 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -115,8 +115,8 @@ pub struct ProxyMetrics { #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))] pub allowed_vpc_endpoint_ids: Histogram<10>, - /// Number of connections (per sni). - pub accepted_connections_by_sni: CounterVec>, + /// Number of connections, by the method we used to determine the endpoint. + pub accepted_connections_by_sni: CounterVec, /// Number of connection failures (per kind). 
pub connection_failures_total: CounterVec>, @@ -342,11 +342,20 @@ pub enum LatencyExclusions { ClientCplaneComputeRetry, } +#[derive(LabelGroup)] +#[label(set = SniSet)] +pub struct SniGroup { + pub protocol: Protocol, + pub kind: SniKind, +} + #[derive(FixedCardinalityLabel, Copy, Clone)] -#[label(singleton = "kind")] pub enum SniKind { + /// Domain name based routing. SNI for libpq/websockets. Host for HTTP Sni, + /// Metadata based routing. `options` for libpq/websockets. Header for HTTP NoSni, + /// Metadata based routing, using the password field. PasswordHack, } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index cf331b8bc0..0a86022e78 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -383,9 +383,7 @@ pub(crate) async fn handle_client( let cancellation_handler_clone = Arc::clone(&cancellation_handler); let session = cancellation_handler_clone.get_key(); - session - .write_cancel_key(node.cancel_closure.clone()) - .await?; + session.write_cancel_key(node.cancel_closure.clone())?; prepare_client_connection(&node, *session.key(), &mut stream).await?; diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c100b8d716..8f9bd2de2d 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -94,7 +94,7 @@ impl ProxyPassthrough { tracing::warn!(session_id = ?self.session_id, ?err, "could not cancel the query in the database"); } - drop(self.cancel.remove_cancel_key().await); // we don't need a result. If the queue is full, we just log the error + drop(self.cancel.remove_cancel_key()); // we don't need a result. 
If the queue is full, we just log the error res } diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index aa627b29a6..f71730c533 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -47,7 +47,7 @@ impl RedisKVClient { pub(crate) async fn query( &mut self, - q: impl Queryable, + q: &impl Queryable, ) -> anyhow::Result { if !self.limiter.check() { tracing::info!("Rate limit exceeded. Skipping query"); diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 6f24ad3dec..2a7069b1c2 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -56,6 +56,7 @@ use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; +pub(crate) const AUTH_BROKER_SNI: &str = "apiauth"; pub async fn task_main( config: &'static ProxyConfig, diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index fee5942b7e..dfaeedaeae 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -38,7 +38,7 @@ use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; use crate::context::RequestContext; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::http::{ReadBodyError, read_body_with_limit}; -use crate::metrics::{HttpDirection, Metrics}; +use crate::metrics::{HttpDirection, Metrics, SniGroup, SniKind}; use crate::proxy::{NeonOptions, run_until_cancelled}; use crate::serverless::backend::HttpConnError; use crate::types::{DbName, RoleName}; @@ -227,6 +227,32 @@ fn get_conn_info( } } + // check the URL that was used, for metrics + { + let host_endpoint = headers + // get the host header + .get("host") + // extract the domain + .and_then(|h| { + let (host, _port) = h.to_str().ok()?.split_once(':')?; + Some(host) + }) + // get the endpoint prefix + .map(|h| h.split_once('.').map_or(h, |(prefix, 
_)| prefix)); + + let kind = if host_endpoint == Some(&*endpoint) { + SniKind::Sni + } else { + SniKind::NoSni + }; + + let protocol = ctx.protocol(); + Metrics::get() + .proxy + .accepted_connections_by_sni + .inc(SniGroup { protocol, kind }); + } + ctx.set_user_agent( headers .get(hyper::header::USER_AGENT) diff --git a/proxy/src/tls/server_config.rs b/proxy/src/tls/server_config.rs index 8f8917ef62..66c53b3aff 100644 --- a/proxy/src/tls/server_config.rs +++ b/proxy/src/tls/server_config.rs @@ -1,4 +1,5 @@ use std::collections::{HashMap, HashSet}; +use std::path::Path; use std::sync::Arc; use anyhow::{Context, bail}; @@ -21,9 +22,9 @@ pub struct TlsConfig { /// Configure TLS for the main endpoint. pub fn configure_tls( - key_path: &str, - cert_path: &str, - certs_dir: Option<&String>, + key_path: &Path, + cert_path: &Path, + certs_dir: Option<&Path>, allow_tls_keylogfile: bool, ) -> anyhow::Result { // add default certificate @@ -39,8 +40,7 @@ pub fn configure_tls( let key_path = path.join("tls.key"); let cert_path = path.join("tls.crt"); if key_path.exists() && cert_path.exists() { - cert_resolver - .add_cert_path(&key_path.to_string_lossy(), &cert_path.to_string_lossy())?; + cert_resolver.add_cert_path(&key_path, &cert_path)?; } } } @@ -86,7 +86,7 @@ pub struct CertResolver { } impl CertResolver { - fn parse_new(key_path: &str, cert_path: &str) -> anyhow::Result { + fn parse_new(key_path: &Path, cert_path: &Path) -> anyhow::Result { let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; Self::new(priv_key, cert_chain) } @@ -103,7 +103,7 @@ impl CertResolver { Ok(Self { certs, default }) } - fn add_cert_path(&mut self, key_path: &str, cert_path: &str) -> anyhow::Result<()> { + fn add_cert_path(&mut self, key_path: &Path, cert_path: &Path) -> anyhow::Result<()> { let (priv_key, cert_chain) = parse_key_cert(key_path, cert_path)?; self.add_cert(priv_key, cert_chain) } @@ -124,26 +124,29 @@ impl CertResolver { } fn parse_key_cert( - key_path: &str, - 
cert_path: &str, + key_path: &Path, + cert_path: &Path, ) -> anyhow::Result<(PrivateKeyDer<'static>, Vec>)> { let priv_key = { let key_bytes = std::fs::read(key_path) - .with_context(|| format!("Failed to read TLS keys at '{key_path}'"))?; + .with_context(|| format!("Failed to read TLS keys at '{}'", key_path.display()))?; rustls_pemfile::private_key(&mut &key_bytes[..]) - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? - .with_context(|| format!("Failed to parse TLS keys at '{key_path}'"))? + .with_context(|| format!("Failed to parse TLS keys at '{}'", key_path.display()))? + .with_context(|| format!("Failed to parse TLS keys at '{}'", key_path.display()))? }; - let cert_chain_bytes = std::fs::read(cert_path) - .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?; + let cert_chain_bytes = std::fs::read(cert_path).context(format!( + "Failed to read TLS cert file at '{}.'", + cert_path.display() + ))?; let cert_chain = { rustls_pemfile::certs(&mut &cert_chain_bytes[..]) .try_collect() .with_context(|| { format!( - "Failed to read TLS certificate chain from bytes from file at '{cert_path}'." + "Failed to read TLS certificate chain from bytes from file at '{}'.", + cert_path.display() ) })? }; diff --git a/rust-toolchain.toml b/rust-toolchain.toml index a0d5970bd5..c48def3483 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.86.0" +channel = "1.87.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs index 5849df0343..b364ac8e48 100644 --- a/safekeeper/client/src/mgmt_api.rs +++ b/safekeeper/client/src/mgmt_api.rs @@ -121,6 +121,20 @@ impl Client { resp.json().await.map_err(Error::ReceiveBody) } + pub async fn switch_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + let uri = format!( + "{}/v1/tenant/{}/timeline/{}/membership", + self.mgmt_api_endpoint, tenant_id, timeline_id + ); + let resp = self.put(&uri, req).await?; + resp.json().await.map_err(Error::ReceiveBody) + } + pub async fn delete_tenant(&self, tenant_id: TenantId) -> Result { let uri = format!("{}/v1/tenant/{}", self.mgmt_api_endpoint, tenant_id); let resp = self diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index c267a55cb6..8d31ada24f 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -22,9 +22,10 @@ use safekeeper::defaults::{ DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE, }; +use safekeeper::wal_backup::WalBackup; use safekeeper::{ BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf, - WAL_SERVICE_RUNTIME, broker, control_file, http, wal_backup, wal_service, + WAL_SERVICE_RUNTIME, broker, control_file, http, wal_service, }; use sd_notify::NotifyState; use storage_broker::{DEFAULT_ENDPOINT, Uri}; @@ -484,15 +485,15 @@ async fn start_safekeeper(conf: Arc) -> Result<()> { None => None, }; - let global_timelines = Arc::new(GlobalTimelines::new(conf.clone())); + let wal_backup = Arc::new(WalBackup::new(&conf).await?); + + let global_timelines = Arc::new(GlobalTimelines::new(conf.clone(), wal_backup.clone())); // Register metrics collector for active timelines. 
It's important to do this // after daemonizing, otherwise process collector will be upset. let timeline_collector = safekeeper::metrics::TimelineCollector::new(global_timelines.clone()); metrics::register_internal(Box::new(timeline_collector))?; - wal_backup::init_remote_storage(&conf).await; - // Keep handles to main tasks to die if any of them disappears. let mut tasks_handles: FuturesUnordered> = FuturesUnordered::new(); diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 11daff22cb..7984c2e2b9 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use anyhow::{Result, bail}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use remote_storage::GenericRemoteStorage; use safekeeper_api::membership::Configuration; use tokio::fs::OpenOptions; use tokio::io::{AsyncSeekExt, AsyncWriteExt}; @@ -30,6 +31,7 @@ pub struct Request { pub async fn handle_request( request: Request, global_timelines: Arc, + storage: Arc, ) -> Result<()> { // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :( // if LSN will point to the middle of a WAL record, timeline will be in "broken" state @@ -127,6 +129,7 @@ pub async fn handle_request( assert!(first_ondisk_segment >= first_segment); copy_s3_segments( + &storage, wal_seg_size, &request.source_ttid, &request.destination_ttid, diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 2b2d721db2..384c582678 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -243,8 +243,7 @@ async fn timeline_pull_handler(mut request: Request) -> Result) -> Result, // so create the chan and write to it in another task. 
@@ -270,6 +270,7 @@ async fn timeline_snapshot_handler(request: Request) -> Result) -> Result bool { - self.remote_storage.is_some() && self.wal_backup_enabled - } -} - impl SafeKeeperConf { pub fn dummy() -> Self { SafeKeeperConf { diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index 1510a51019..14aef1ee5e 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -7,7 +7,9 @@ use bytes::Bytes; use camino::Utf8PathBuf; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; +use http_utils::error::ApiError; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; +use remote_storage::GenericRemoteStorage; use reqwest::Certificate; use safekeeper_api::Term; use safekeeper_api::models::{PullTimelineRequest, PullTimelineResponse, TimelineStatus}; @@ -30,7 +32,7 @@ use utils::pausable_failpoint; use crate::control_file::CONTROL_FILE_NAME; use crate::state::{EvictionState, TimelinePersistentState}; -use crate::timeline::{Timeline, WalResidentTimeline}; +use crate::timeline::{Timeline, TimelineError, WalResidentTimeline}; use crate::timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}; use crate::wal_storage::open_wal_file; use crate::{GlobalTimelines, debug_dump, wal_backup}; @@ -42,6 +44,7 @@ pub async fn stream_snapshot( source: NodeId, destination: NodeId, tx: mpsc::Sender>, + storage: Option>, ) { match tli.try_wal_residence_guard().await { Err(e) => { @@ -52,10 +55,32 @@ pub async fn stream_snapshot( Ok(maybe_resident_tli) => { if let Err(e) = match maybe_resident_tli { Some(resident_tli) => { - stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone()) - .await + stream_snapshot_resident_guts( + resident_tli, + source, + destination, + tx.clone(), + storage, + ) + .await + } + None => { + if let Some(storage) = storage { + stream_snapshot_offloaded_guts( + tli, + source, + destination, + tx.clone(), + &storage, + ) + .await + } else { + 
tx.send(Err(anyhow!("remote storage not configured"))) + .await + .ok(); + return; + } } - None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await, } { // Error type/contents don't matter as they won't can't reach the client // (hyper likely doesn't do anything with it), but http stream will be @@ -122,10 +147,12 @@ pub(crate) async fn stream_snapshot_offloaded_guts( source: NodeId, destination: NodeId, tx: mpsc::Sender>, + storage: &GenericRemoteStorage, ) -> Result<()> { let mut ar = prepare_tar_stream(tx); - tli.snapshot_offloaded(&mut ar, source, destination).await?; + tli.snapshot_offloaded(&mut ar, source, destination, storage) + .await?; ar.finish().await?; @@ -138,10 +165,13 @@ pub async fn stream_snapshot_resident_guts( source: NodeId, destination: NodeId, tx: mpsc::Sender>, + storage: Option>, ) -> Result<()> { let mut ar = prepare_tar_stream(tx); - let bctx = tli.start_snapshot(&mut ar, source, destination).await?; + let bctx = tli + .start_snapshot(&mut ar, source, destination, storage) + .await?; pausable_failpoint!("sk-snapshot-after-list-pausable"); let tli_dir = tli.get_timeline_dir(); @@ -181,6 +211,7 @@ impl Timeline { ar: &mut tokio_tar::Builder, source: NodeId, destination: NodeId, + storage: &GenericRemoteStorage, ) -> Result<()> { // Take initial copy of control file, then release state lock let mut control_file = { @@ -215,6 +246,7 @@ impl Timeline { // can fail if the timeline was un-evicted and modified in the background. 
let remote_timeline_path = &self.remote_path; wal_backup::copy_partial_segment( + storage, &replace.previous.remote_path(remote_timeline_path), &replace.current.remote_path(remote_timeline_path), ) @@ -261,6 +293,7 @@ impl WalResidentTimeline { ar: &mut tokio_tar::Builder, source: NodeId, destination: NodeId, + storage: Option>, ) -> Result { let mut shared_state = self.write_shared_state().await; let wal_seg_size = shared_state.get_wal_seg_size(); @@ -282,6 +315,7 @@ impl WalResidentTimeline { let remote_timeline_path = &self.tli.remote_path; wal_backup::copy_partial_segment( + &*storage.context("remote storage not configured")?, &replace.previous.remote_path(remote_timeline_path), &replace.current.remote_path(remote_timeline_path), ) @@ -395,7 +429,7 @@ pub async fn handle_request( sk_auth_token: Option, ssl_ca_certs: Vec, global_timelines: Arc, -) -> Result { +) -> Result { let existing_tli = global_timelines.get(TenantTimelineId::new( request.tenant_id, request.timeline_id, @@ -411,7 +445,9 @@ pub async fn handle_request( for ssl_ca_cert in ssl_ca_certs { http_client = http_client.add_root_certificate(ssl_ca_cert); } - let http_client = http_client.build()?; + let http_client = http_client + .build() + .map_err(|e| ApiError::InternalServerError(e.into()))?; let http_hosts = request.http_hosts.clone(); @@ -443,10 +479,10 @@ pub async fn handle_request( // offline and C comes online. Then we want a pull on C with A and B as hosts to work. let min_required_successful = (http_hosts.len() - 1).max(1); if statuses.len() < min_required_successful { - bail!( + return Err(ApiError::InternalServerError(anyhow::anyhow!( "only got {} successful status responses. 
required: {min_required_successful}", statuses.len() - ) + ))); } // Find the most advanced safekeeper @@ -465,14 +501,32 @@ pub async fn handle_request( assert!(status.tenant_id == request.tenant_id); assert!(status.timeline_id == request.timeline_id); - pull_timeline( + let check_tombstone = !request.ignore_tombstone.unwrap_or_default(); + + match pull_timeline( status, safekeeper_host, sk_auth_token, http_client, global_timelines, + check_tombstone, ) .await + { + Ok(resp) => Ok(resp), + Err(e) => { + match e.downcast_ref::() { + Some(TimelineError::AlreadyExists(_)) => Ok(PullTimelineResponse { + safekeeper_host: None, + }), + Some(TimelineError::CreationInProgress(_)) => { + // We don't return success here because creation might still fail. + Err(ApiError::Conflict("Creation in progress".to_owned())) + } + _ => Err(ApiError::InternalServerError(e)), + } + } + } } async fn pull_timeline( @@ -481,6 +535,7 @@ async fn pull_timeline( sk_auth_token: Option, http_client: reqwest::Client, global_timelines: Arc, + check_tombstone: bool, ) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( @@ -552,7 +607,7 @@ async fn pull_timeline( // Finally, load the timeline. 
let _tli = global_timelines - .load_temp_timeline(ttid, &tli_dir_path, false) + .load_temp_timeline(ttid, &tli_dir_path, check_tombstone) .await?; Ok(PullTimelineResponse { diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 33e3d0485c..05f827494e 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -513,7 +513,7 @@ impl SafekeeperPostgresHandler { let end_pos = end_watch.get(); if end_pos < start_pos { - warn!( + info!( "requested start_pos {} is ahead of available WAL end_pos {}", start_pos, end_pos ); diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index 618e2b59d2..e2817c8337 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -18,7 +18,7 @@ use crate::send_wal::EndWatch; use crate::state::{TimelinePersistentState, TimelineState}; use crate::timeline::{SharedState, StateSK, Timeline, get_timeline_dir}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup::remote_timeline_path; +use crate::wal_backup::{WalBackup, remote_timeline_path}; use crate::{SafeKeeperConf, control_file, receive_wal, wal_storage}; /// A Safekeeper testing or benchmarking environment. Uses a tempdir for storage, removed on drop. 
@@ -101,18 +101,22 @@ impl Env { let safekeeper = self.make_safekeeper(node_id, ttid, start_lsn).await?; let shared_state = SharedState::new(StateSK::Loaded(safekeeper)); + let wal_backup = Arc::new(WalBackup::new(&conf).await?); + let timeline = Timeline::new( ttid, &timeline_dir, &remote_path, shared_state, conf.clone(), + wal_backup.clone(), ); timeline.bootstrap( &mut timeline.write_shared_state().await, &conf, Arc::new(TimelinesSet::default()), // ignored for now RateLimiter::new(0, 0), + wal_backup, ); Ok(timeline) } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index b7ba28f435..588bd4f2c9 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -35,7 +35,8 @@ use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, Tim use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; -use crate::wal_backup::{self, remote_timeline_path}; +use crate::wal_backup; +use crate::wal_backup::{WalBackup, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::{SafeKeeperConf, control_file, debug_dump, timeline_manager, wal_storage}; @@ -452,6 +453,8 @@ pub struct Timeline { manager_ctl: ManagerCtl, conf: Arc, + pub(crate) wal_backup: Arc, + remote_deletion: std::sync::Mutex>, /// Hold this gate from code that depends on the Timeline's non-shut-down state. 
While holding @@ -476,6 +479,7 @@ impl Timeline { remote_path: &RemotePath, shared_state: SharedState, conf: Arc, + wal_backup: Arc, ) -> Arc { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(shared_state.sk.state().commit_lsn); @@ -509,6 +513,7 @@ impl Timeline { wal_backup_active: AtomicBool::new(false), last_removed_segno: AtomicU64::new(0), mgr_status: AtomicStatus::new(), + wal_backup, }) } @@ -516,6 +521,7 @@ impl Timeline { pub fn load_timeline( conf: Arc, ttid: TenantTimelineId, + wal_backup: Arc, ) -> Result> { let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); @@ -529,6 +535,7 @@ impl Timeline { &remote_path, shared_state, conf, + wal_backup, )) } @@ -539,6 +546,7 @@ impl Timeline { conf: &SafeKeeperConf, broker_active_set: Arc, partial_backup_rate_limiter: RateLimiter, + wal_backup: Arc, ) { let (tx, rx) = self.manager_ctl.bootstrap_manager(); @@ -561,6 +569,7 @@ impl Timeline { tx, rx, partial_backup_rate_limiter, + wal_backup, ) .await } @@ -606,9 +615,10 @@ impl Timeline { // it is cancelled, so WAL storage won't be opened again. 
shared_state.sk.close_wal_store(); - if !only_local && self.conf.is_wal_backup_enabled() { + if !only_local { self.remote_delete().await?; } + let dir_existed = delete_dir(&self.timeline_dir).await?; Ok(dir_existed) } @@ -675,11 +685,20 @@ impl Timeline { guard: &mut std::sync::MutexGuard>, ) -> RemoteDeletionReceiver { tracing::info!("starting remote deletion"); + let storage = self.wal_backup.get_storage().clone(); let (result_tx, result_rx) = tokio::sync::watch::channel(None); let ttid = self.ttid; tokio::task::spawn( async move { - let r = wal_backup::delete_timeline(&ttid).await; + let r = if let Some(storage) = storage { + wal_backup::delete_timeline(&storage, &ttid).await + } else { + tracing::info!( + "skipping remote deletion because no remote storage is configured; this effectively leaks the objects in remote storage" + ); + Ok(()) + }; + if let Err(e) = &r { // Log error here in case nobody ever listens for our result (e.g. dropped API request) tracing::error!("remote deletion failed: {e}"); @@ -1046,14 +1065,13 @@ impl WalResidentTimeline { pub async fn get_walreader(&self, start_lsn: Lsn) -> Result { let (_, persisted_state) = self.get_state().await; - let enable_remote_read = self.conf.is_wal_backup_enabled(); WalReader::new( &self.ttid, self.timeline_dir.clone(), &persisted_state, start_lsn, - enable_remote_read, + self.wal_backup.clone(), ) } diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 84c636daf6..e817dbf6f9 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -6,7 +6,7 @@ use anyhow::Context; use camino::Utf8PathBuf; -use remote_storage::RemotePath; +use remote_storage::{GenericRemoteStorage, RemotePath}; use tokio::fs::File; use tokio::io::{AsyncRead, AsyncWriteExt}; use tracing::{debug, info, instrument, warn}; @@ -68,6 +68,10 @@ impl Manager { #[instrument(name = "evict_timeline", skip_all)] pub(crate) async fn evict_timeline(&mut self) -> bool { 
assert!(!self.is_offloaded); + let Some(storage) = self.wal_backup.get_storage() else { + warn!("no remote storage configured, skipping uneviction"); + return false; + }; let partial_backup_uploaded = match &self.partial_backup_uploaded { Some(p) => p.clone(), None => { @@ -87,7 +91,7 @@ impl Manager { .inc(); }); - if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { + if let Err(e) = do_eviction(self, &partial_backup_uploaded, &storage).await { warn!("failed to evict timeline: {:?}", e); return false; } @@ -102,6 +106,10 @@ impl Manager { #[instrument(name = "unevict_timeline", skip_all)] pub(crate) async fn unevict_timeline(&mut self) { assert!(self.is_offloaded); + let Some(storage) = self.wal_backup.get_storage() else { + warn!("no remote storage configured, skipping uneviction"); + return; + }; let partial_backup_uploaded = match &self.partial_backup_uploaded { Some(p) => p.clone(), None => { @@ -121,7 +129,7 @@ impl Manager { .inc(); }); - if let Err(e) = do_uneviction(self, &partial_backup_uploaded).await { + if let Err(e) = do_uneviction(self, &partial_backup_uploaded, &storage).await { warn!("failed to unevict timeline: {:?}", e); return; } @@ -137,8 +145,12 @@ impl Manager { /// Ensure that content matches the remote partial backup, if local segment exists. /// Then change state in control file and in-memory. If `delete_offloaded_wal` is set, /// delete the local segment. 
-async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { - compare_local_segment_with_remote(mgr, partial).await?; +async fn do_eviction( + mgr: &mut Manager, + partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, +) -> anyhow::Result<()> { + compare_local_segment_with_remote(mgr, partial, storage).await?; mgr.tli.switch_to_offloaded(partial).await?; // switch manager state as soon as possible @@ -153,12 +165,16 @@ async fn do_eviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyho /// Ensure that content matches the remote partial backup, if local segment exists. /// Then download segment to local disk and change state in control file and in-memory. -async fn do_uneviction(mgr: &mut Manager, partial: &PartialRemoteSegment) -> anyhow::Result<()> { +async fn do_uneviction( + mgr: &mut Manager, + partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, +) -> anyhow::Result<()> { // if the local segment is present, validate it - compare_local_segment_with_remote(mgr, partial).await?; + compare_local_segment_with_remote(mgr, partial, storage).await?; // atomically download the partial segment - redownload_partial_segment(mgr, partial).await?; + redownload_partial_segment(mgr, partial, storage).await?; mgr.tli.switch_to_present().await?; // switch manager state as soon as possible @@ -181,6 +197,7 @@ async fn delete_local_segment(mgr: &Manager, partial: &PartialRemoteSegment) -> async fn redownload_partial_segment( mgr: &Manager, partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { let tmp_file = mgr.tli.timeline_dir().join("remote_partial.tmp"); let remote_segfile = remote_segment_path(mgr, partial); @@ -190,7 +207,7 @@ async fn redownload_partial_segment( remote_segfile, tmp_file ); - let mut reader = wal_backup::read_object(&remote_segfile, 0).await?; + let mut reader = wal_backup::read_object(storage, &remote_segfile, 0).await?; let mut file = 
File::create(&tmp_file).await?; let actual_len = tokio::io::copy(&mut reader, &mut file).await?; @@ -234,13 +251,16 @@ async fn redownload_partial_segment( async fn compare_local_segment_with_remote( mgr: &Manager, partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { let local_path = local_segment_path(mgr, partial); match File::open(&local_path).await { - Ok(mut local_file) => do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial) - .await - .context("validation failed"), + Ok(mut local_file) => { + do_validation(mgr, &mut local_file, mgr.wal_seg_size, partial, storage) + .await + .context("validation failed") + } Err(_) => { info!( "local WAL file {} is not present, skipping validation", @@ -258,6 +278,7 @@ async fn do_validation( file: &mut File, wal_seg_size: usize, partial: &PartialRemoteSegment, + storage: &GenericRemoteStorage, ) -> anyhow::Result<()> { let local_size = file.metadata().await?.len() as usize; if local_size != wal_seg_size { @@ -270,7 +291,7 @@ async fn do_validation( let remote_segfile = remote_segment_path(mgr, partial); let mut remote_reader: std::pin::Pin> = - wal_backup::read_object(&remote_segfile, 0).await?; + wal_backup::read_object(storage, &remote_segfile, 0).await?; // remote segment should have bytes excatly up to `flush_lsn` let expected_remote_size = partial.flush_lsn.segment_offset(mgr.wal_seg_size); diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 71e99a4de7..48eda92fed 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -35,7 +35,7 @@ use crate::state::TimelineState; use crate::timeline::{ManagerTimeline, ReadGuardSharedState, StateSK, WalResidentTimeline}; use crate::timeline_guard::{AccessService, GuardId, ResidenceGuard}; use crate::timelines_set::{TimelineSetGuard, TimelinesSet}; -use crate::wal_backup::{self, WalBackupTaskHandle}; +use crate::wal_backup::{self, WalBackup, WalBackupTaskHandle}; 
use crate::wal_backup_partial::{self, PartialBackup, PartialRemoteSegment}; pub(crate) struct StateSnapshot { @@ -200,6 +200,7 @@ pub(crate) struct Manager { pub(crate) conf: SafeKeeperConf, pub(crate) wal_seg_size: usize, pub(crate) walsenders: Arc, + pub(crate) wal_backup: Arc, // current state pub(crate) state_version_rx: tokio::sync::watch::Receiver, @@ -238,6 +239,7 @@ pub async fn main_task( manager_tx: tokio::sync::mpsc::UnboundedSender, mut manager_rx: tokio::sync::mpsc::UnboundedReceiver, global_rate_limiter: RateLimiter, + wal_backup: Arc, ) { tli.set_status(Status::Started); @@ -256,6 +258,7 @@ pub async fn main_task( broker_active_set, manager_tx, global_rate_limiter, + wal_backup, ) .await; @@ -371,7 +374,7 @@ pub async fn main_task( mgr.tli_broker_active.set(false); // shutdown background tasks - if mgr.conf.is_wal_backup_enabled() { + if let Some(storage) = mgr.wal_backup.get_storage() { if let Some(backup_task) = mgr.backup_task.take() { // If we fell through here, then the timeline is shutting down. This is important // because otherwise joining on the wal_backup handle might hang. 
@@ -379,7 +382,7 @@ pub async fn main_task( backup_task.join().await; } - wal_backup::update_task(&mut mgr, false, &last_state).await; + wal_backup::update_task(&mut mgr, storage, false, &last_state).await; } if let Some(recovery_task) = &mut mgr.recovery_task { @@ -415,11 +418,13 @@ impl Manager { broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, global_rate_limiter: RateLimiter, + wal_backup: Arc, ) -> Manager { let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; Manager { wal_seg_size: tli.get_wal_seg_size().await, walsenders: tli.get_walsenders().clone(), + wal_backup, state_version_rx: tli.get_state_version_rx(), num_computes_rx: tli.get_walreceivers().get_num_rx(), tli_broker_active: broker_active_set.guard(tli.clone()), @@ -477,8 +482,8 @@ impl Manager { let is_wal_backup_required = wal_backup::is_wal_backup_required(self.wal_seg_size, num_computes, state); - if self.conf.is_wal_backup_enabled() { - wal_backup::update_task(self, is_wal_backup_required, state).await; + if let Some(storage) = self.wal_backup.get_storage() { + wal_backup::update_task(self, storage, is_wal_backup_required, state).await; } // update the state in Arc @@ -624,9 +629,9 @@ impl Manager { /// Spawns partial WAL backup task if needed. async fn update_partial_backup(&mut self, state: &StateSnapshot) { // check if WAL backup is enabled and should be started - if !self.conf.is_wal_backup_enabled() { + let Some(storage) = self.wal_backup.get_storage() else { return; - } + }; if self.partial_backup_task.is_some() { // partial backup is already running @@ -650,6 +655,7 @@ impl Manager { self.conf.clone(), self.global_rate_limiter.clone(), cancel.clone(), + storage, )); self.partial_backup_task = Some((handle, cancel)); } @@ -669,6 +675,10 @@ impl Manager { /// Reset partial backup state and remove its remote storage data. Since it /// might concurrently uploading something, cancel the task first. 
async fn backup_partial_reset(&mut self) -> anyhow::Result> { + let Some(storage) = self.wal_backup.get_storage() else { + anyhow::bail!("remote storage is not enabled"); + }; + info!("resetting partial backup state"); // Force unevict timeline if it is evicted before erasing partial backup // state. The intended use of this function is to drop corrupted remote @@ -689,7 +699,7 @@ impl Manager { } let tli = self.wal_resident_timeline()?; - let mut partial_backup = PartialBackup::new(tli, self.conf.clone()).await; + let mut partial_backup = PartialBackup::new(tli, self.conf.clone(), storage).await; // Reset might fail e.g. when cfile is already reset but s3 removal // failed, so set manager state to None beforehand. In any case caller // is expected to retry until success. diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 41abee369e..af33bcbd20 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -25,6 +25,7 @@ use crate::rate_limit::RateLimiter; use crate::state::TimelinePersistentState; use crate::timeline::{Timeline, TimelineError, delete_dir, get_tenant_dir, get_timeline_dir}; use crate::timelines_set::TimelinesSet; +use crate::wal_backup::WalBackup; use crate::wal_storage::Storage; use crate::{SafeKeeperConf, control_file, wal_storage}; @@ -47,15 +48,24 @@ struct GlobalTimelinesState { conf: Arc, broker_active_set: Arc, global_rate_limiter: RateLimiter, + wal_backup: Arc, } impl GlobalTimelinesState { /// Get dependencies for a timeline constructor. - fn get_dependencies(&self) -> (Arc, Arc, RateLimiter) { + fn get_dependencies( + &self, + ) -> ( + Arc, + Arc, + RateLimiter, + Arc, + ) { ( self.conf.clone(), self.broker_active_set.clone(), self.global_rate_limiter.clone(), + self.wal_backup.clone(), ) } @@ -84,7 +94,7 @@ pub struct GlobalTimelines { impl GlobalTimelines { /// Create a new instance of the global timelines map. 
- pub fn new(conf: Arc) -> Self { + pub fn new(conf: Arc, wal_backup: Arc) -> Self { Self { state: Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), @@ -92,6 +102,7 @@ impl GlobalTimelines { conf, broker_active_set: Arc::new(TimelinesSet::default()), global_rate_limiter: RateLimiter::new(1, 1), + wal_backup, }), } } @@ -147,7 +158,7 @@ impl GlobalTimelines { /// just lock and unlock it for each timeline -- this function is called /// during init when nothing else is running, so this is fine. async fn load_tenant_timelines(&self, tenant_id: TenantId) -> Result<()> { - let (conf, broker_active_set, partial_backup_rate_limiter) = { + let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = { let state = self.state.lock().unwrap(); state.get_dependencies() }; @@ -162,7 +173,7 @@ impl GlobalTimelines { TimelineId::from_str(timeline_dir_entry.file_name().to_str().unwrap_or("")) { let ttid = TenantTimelineId::new(tenant_id, timeline_id); - match Timeline::load_timeline(conf.clone(), ttid) { + match Timeline::load_timeline(conf.clone(), ttid, wal_backup.clone()) { Ok(tli) => { let mut shared_state = tli.write_shared_state().await; self.state @@ -175,6 +186,7 @@ impl GlobalTimelines { &conf, broker_active_set.clone(), partial_backup_rate_limiter.clone(), + wal_backup.clone(), ); } // If we can't load a timeline, it's most likely because of a corrupted @@ -212,6 +224,10 @@ impl GlobalTimelines { self.state.lock().unwrap().broker_active_set.clone() } + pub fn get_wal_backup(&self) -> Arc { + self.state.lock().unwrap().wal_backup.clone() + } + /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. pub(crate) async fn create( @@ -222,7 +238,7 @@ impl GlobalTimelines { start_lsn: Lsn, commit_lsn: Lsn, ) -> Result> { - let (conf, _, _) = { + let (conf, _, _, _) = { let state = self.state.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. 
@@ -267,7 +283,7 @@ impl GlobalTimelines { check_tombstone: bool, ) -> Result> { // Check for existence and mark that we're creating it. - let (conf, broker_active_set, partial_backup_rate_limiter) = { + let (conf, broker_active_set, partial_backup_rate_limiter, wal_backup) = { let mut state = self.state.lock().unwrap(); match state.timelines.get(&ttid) { Some(GlobalMapTimeline::CreationInProgress) => { @@ -296,7 +312,14 @@ impl GlobalTimelines { }; // Do the actual move and reflect the result in the map. - match GlobalTimelines::install_temp_timeline(ttid, tmp_path, conf.clone()).await { + match GlobalTimelines::install_temp_timeline( + ttid, + tmp_path, + conf.clone(), + wal_backup.clone(), + ) + .await + { Ok(timeline) => { let mut timeline_shared_state = timeline.write_shared_state().await; let mut state = self.state.lock().unwrap(); @@ -314,6 +337,7 @@ impl GlobalTimelines { &conf, broker_active_set, partial_backup_rate_limiter, + wal_backup, ); drop(timeline_shared_state); Ok(timeline) @@ -336,6 +360,7 @@ impl GlobalTimelines { ttid: TenantTimelineId, tmp_path: &Utf8PathBuf, conf: Arc, + wal_backup: Arc, ) -> Result> { let tenant_path = get_tenant_dir(conf.as_ref(), &ttid.tenant_id); let timeline_path = get_timeline_dir(conf.as_ref(), &ttid); @@ -377,7 +402,7 @@ impl GlobalTimelines { // Do the move. durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?; - Timeline::load_timeline(conf, ttid) + Timeline::load_timeline(conf, ttid, wal_backup) } /// Get a timeline from the global map. 
If it's not present, it doesn't exist on disk, diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index 56f4a2faf9..0beb272a60 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -2,6 +2,7 @@ use std::cmp::min; use std::collections::HashSet; use std::num::NonZeroU32; use std::pin::Pin; +use std::sync::Arc; use std::time::Duration; use anyhow::{Context, Result}; @@ -17,7 +18,7 @@ use safekeeper_api::models::PeerInfo; use tokio::fs::File; use tokio::select; use tokio::sync::mpsc::{self, Receiver, Sender}; -use tokio::sync::{OnceCell, watch}; +use tokio::sync::watch; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; @@ -63,7 +64,12 @@ pub(crate) fn is_wal_backup_required( /// Based on peer information determine which safekeeper should offload; if it /// is me, run (per timeline) task, if not yet. OTOH, if it is not me and task /// is running, kill it. -pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &StateSnapshot) { +pub(crate) async fn update_task( + mgr: &mut Manager, + storage: Arc, + need_backup: bool, + state: &StateSnapshot, +) { let (offloader, election_dbg_str) = determine_offloader(&state.peers, state.backup_lsn, mgr.tli.ttid, &mgr.conf); let elected_me = Some(mgr.conf.my_id) == offloader; @@ -82,7 +88,12 @@ pub(crate) async fn update_task(mgr: &mut Manager, need_backup: bool, state: &St return; }; - let async_task = backup_task_main(resident, mgr.conf.backup_parallel_jobs, shutdown_rx); + let async_task = backup_task_main( + resident, + storage, + mgr.conf.backup_parallel_jobs, + shutdown_rx, + ); let handle = if mgr.conf.current_thread_runtime { tokio::spawn(async_task) @@ -169,33 +180,31 @@ fn determine_offloader( } } -static REMOTE_STORAGE: OnceCell> = OnceCell::const_new(); - -// Storage must be configured and initialized when this is called. 
-fn get_configured_remote_storage() -> &'static GenericRemoteStorage { - REMOTE_STORAGE - .get() - .expect("failed to get remote storage") - .as_ref() - .unwrap() +pub struct WalBackup { + storage: Option>, } -pub async fn init_remote_storage(conf: &SafeKeeperConf) { - // TODO: refactor REMOTE_STORAGE to avoid using global variables, and provide - // dependencies to all tasks instead. - REMOTE_STORAGE - .get_or_init(|| async { - if let Some(conf) = conf.remote_storage.as_ref() { - Some( - GenericRemoteStorage::from_config(conf) - .await - .expect("failed to create remote storage"), - ) - } else { - None +impl WalBackup { + /// Create a new WalBackup instance. + pub async fn new(conf: &SafeKeeperConf) -> Result { + if !conf.wal_backup_enabled { + return Ok(Self { storage: None }); + } + + match conf.remote_storage.as_ref() { + Some(config) => { + let storage = GenericRemoteStorage::from_config(config).await?; + Ok(Self { + storage: Some(Arc::new(storage)), + }) } - }) - .await; + None => Ok(Self { storage: None }), + } + } + + pub fn get_storage(&self) -> Option> { + self.storage.clone() + } } struct WalBackupTask { @@ -204,12 +213,14 @@ struct WalBackupTask { wal_seg_size: usize, parallel_jobs: usize, commit_lsn_watch_rx: watch::Receiver, + storage: Arc, } /// Offload single timeline. 
#[instrument(name = "wal_backup", skip_all, fields(ttid = %tli.ttid))] async fn backup_task_main( tli: WalResidentTimeline, + storage: Arc, parallel_jobs: usize, mut shutdown_rx: Receiver<()>, ) { @@ -223,6 +234,7 @@ async fn backup_task_main( timeline_dir: tli.get_timeline_dir(), timeline: tli, parallel_jobs, + storage, }; // task is spinned up only when wal_seg_size already initialized @@ -293,6 +305,7 @@ impl WalBackupTask { match backup_lsn_range( &self.timeline, + self.storage.clone(), &mut backup_lsn, commit_lsn, self.wal_seg_size, @@ -322,6 +335,7 @@ impl WalBackupTask { async fn backup_lsn_range( timeline: &WalResidentTimeline, + storage: Arc, backup_lsn: &mut Lsn, end_lsn: Lsn, wal_seg_size: usize, @@ -352,7 +366,12 @@ async fn backup_lsn_range( loop { let added_task = match iter.next() { Some(s) => { - uploads.push_back(backup_single_segment(s, timeline_dir, remote_timeline_path)); + uploads.push_back(backup_single_segment( + &storage, + s, + timeline_dir, + remote_timeline_path, + )); true } None => false, @@ -388,6 +407,7 @@ async fn backup_lsn_range( } async fn backup_single_segment( + storage: &GenericRemoteStorage, seg: &Segment, timeline_dir: &Utf8Path, remote_timeline_path: &RemotePath, @@ -395,7 +415,13 @@ async fn backup_single_segment( let segment_file_path = seg.file_path(timeline_dir)?; let remote_segment_path = seg.remote_path(remote_timeline_path); - let res = backup_object(&segment_file_path, &remote_segment_path, seg.size()).await; + let res = backup_object( + storage, + &segment_file_path, + &remote_segment_path, + seg.size(), + ) + .await; if res.is_ok() { BACKED_UP_SEGMENTS.inc(); } else { @@ -455,12 +481,11 @@ fn get_segments(start: Lsn, end: Lsn, seg_size: usize) -> Vec { } async fn backup_object( + storage: &GenericRemoteStorage, source_file: &Utf8Path, target_file: &RemotePath, size: usize, ) -> Result<()> { - let storage = get_configured_remote_storage(); - let file = File::open(&source_file) .await .with_context(|| format!("Failed 
to open file {source_file:?} for wal backup"))?; @@ -475,12 +500,11 @@ async fn backup_object( } pub(crate) async fn backup_partial_segment( + storage: &GenericRemoteStorage, source_file: &Utf8Path, target_file: &RemotePath, size: usize, ) -> Result<()> { - let storage = get_configured_remote_storage(); - let file = File::open(&source_file) .await .with_context(|| format!("Failed to open file {source_file:?} for wal backup"))?; @@ -504,25 +528,20 @@ pub(crate) async fn backup_partial_segment( } pub(crate) async fn copy_partial_segment( + storage: &GenericRemoteStorage, source: &RemotePath, destination: &RemotePath, ) -> Result<()> { - let storage = get_configured_remote_storage(); let cancel = CancellationToken::new(); storage.copy_object(source, destination, &cancel).await } pub async fn read_object( + storage: &GenericRemoteStorage, file_path: &RemotePath, offset: u64, ) -> anyhow::Result>> { - let storage = REMOTE_STORAGE - .get() - .context("Failed to get remote storage")? - .as_ref() - .context("No remote storage configured")?; - info!("segment download about to start from remote path {file_path:?} at offset {offset}"); let cancel = CancellationToken::new(); @@ -547,8 +566,10 @@ pub async fn read_object( /// Delete WAL files for the given timeline. Remote storage must be configured /// when called. -pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { - let storage = get_configured_remote_storage(); +pub async fn delete_timeline( + storage: &GenericRemoteStorage, + ttid: &TenantTimelineId, +) -> Result<()> { let remote_path = remote_timeline_path(ttid)?; // see DEFAULT_MAX_KEYS_PER_LIST_RESPONSE @@ -618,14 +639,14 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> { } /// Used by wal_backup_partial. 
-pub async fn delete_objects(paths: &[RemotePath]) -> Result<()> { +pub async fn delete_objects(storage: &GenericRemoteStorage, paths: &[RemotePath]) -> Result<()> { let cancel = CancellationToken::new(); // not really used - let storage = get_configured_remote_storage(); storage.delete_objects(paths, &cancel).await } /// Copy segments from one timeline to another. Used in copy_timeline. pub async fn copy_s3_segments( + storage: &GenericRemoteStorage, wal_seg_size: usize, src_ttid: &TenantTimelineId, dst_ttid: &TenantTimelineId, @@ -634,12 +655,6 @@ pub async fn copy_s3_segments( ) -> Result<()> { const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024; - let storage = REMOTE_STORAGE - .get() - .expect("failed to get remote storage") - .as_ref() - .unwrap(); - let remote_dst_path = remote_timeline_path(dst_ttid)?; let cancel = CancellationToken::new(); diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 049852a048..fe0f1b3607 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -19,9 +19,11 @@ //! file. Code updates state in the control file before doing any S3 operations. //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. 
+use std::sync::Arc; + use camino::Utf8PathBuf; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo}; -use remote_storage::RemotePath; +use remote_storage::{GenericRemoteStorage, RemotePath}; use safekeeper_api::Term; use serde::{Deserialize, Serialize}; use tokio_util::sync::CancellationToken; @@ -154,12 +156,16 @@ pub struct PartialBackup { conf: SafeKeeperConf, local_prefix: Utf8PathBuf, remote_timeline_path: RemotePath, - + storage: Arc, state: State, } impl PartialBackup { - pub async fn new(tli: WalResidentTimeline, conf: SafeKeeperConf) -> PartialBackup { + pub async fn new( + tli: WalResidentTimeline, + conf: SafeKeeperConf, + storage: Arc, + ) -> PartialBackup { let (_, persistent_state) = tli.get_state().await; let wal_seg_size = tli.get_wal_seg_size().await; @@ -173,6 +179,7 @@ impl PartialBackup { conf, local_prefix, remote_timeline_path, + storage, } } @@ -240,7 +247,8 @@ impl PartialBackup { let remote_path = prepared.remote_path(&self.remote_timeline_path); // Upload first `backup_bytes` bytes of the segment to the remote storage. - wal_backup::backup_partial_segment(&local_path, &remote_path, backup_bytes).await?; + wal_backup::backup_partial_segment(&self.storage, &local_path, &remote_path, backup_bytes) + .await?; PARTIAL_BACKUP_UPLOADED_BYTES.inc_by(backup_bytes as u64); // We uploaded the segment, now let's verify that the data is still actual. @@ -326,7 +334,7 @@ impl PartialBackup { let remote_path = self.remote_timeline_path.join(seg); objects_to_delete.push(remote_path); } - wal_backup::delete_objects(&objects_to_delete).await + wal_backup::delete_objects(&self.storage, &objects_to_delete).await } /// Delete all non-Uploaded segments from the remote storage. 
There should be only one @@ -424,6 +432,7 @@ pub async fn main_task( conf: SafeKeeperConf, limiter: RateLimiter, cancel: CancellationToken, + storage: Arc, ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; @@ -432,7 +441,7 @@ pub async fn main_task( let mut commit_lsn_rx = tli.get_commit_lsn_watch_rx(); let mut flush_lsn_rx = tli.get_term_flush_lsn_watch_rx(); - let mut backup = PartialBackup::new(tli, conf).await; + let mut backup = PartialBackup::new(tli, conf, storage).await; debug!("state: {:?}", backup.state); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index f0bac4b40a..8ba3e7cc47 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -21,6 +21,7 @@ use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo, dispatch_pgversion}; use pq_proto::SystemId; use remote_storage::RemotePath; +use std::sync::Arc; use tokio::fs::{self, File, OpenOptions, remove_file}; use tokio::io::{AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; use tracing::*; @@ -32,7 +33,7 @@ use crate::metrics::{ REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, WalStorageMetrics, time_io_closure, }; use crate::state::TimelinePersistentState; -use crate::wal_backup::{read_object, remote_timeline_path}; +use crate::wal_backup::{WalBackup, read_object, remote_timeline_path}; pub trait Storage { // Last written LSN. 
@@ -645,7 +646,7 @@ pub struct WalReader { wal_segment: Option>>, // S3 will be used to read WAL if LSN is not available locally - enable_remote_read: bool, + wal_backup: Arc, // We don't have WAL locally if LSN is less than local_start_lsn local_start_lsn: Lsn, @@ -664,7 +665,7 @@ impl WalReader { timeline_dir: Utf8PathBuf, state: &TimelinePersistentState, start_pos: Lsn, - enable_remote_read: bool, + wal_backup: Arc, ) -> Result { if state.server.wal_seg_size == 0 || state.local_start_lsn == Lsn(0) { bail!("state uninitialized, no data to read"); @@ -693,7 +694,7 @@ impl WalReader { wal_seg_size: state.server.wal_seg_size as usize, pos: start_pos, wal_segment: None, - enable_remote_read, + wal_backup, local_start_lsn: state.local_start_lsn, timeline_start_lsn: state.timeline_start_lsn, pg_version: state.server.pg_version / 10000, @@ -812,9 +813,9 @@ impl WalReader { } // Try to open remote file, if remote reads are enabled - if self.enable_remote_read { + if let Some(storage) = self.wal_backup.get_storage() { let remote_wal_file_path = self.remote_path.join(&wal_file_name); - return read_object(&remote_wal_file_path, xlogoff as u64).await; + return read_object(&storage, &remote_wal_file_path, xlogoff as u64).await; } bail!("WAL segment is not found") diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 649113b8ce..02c02c0e7f 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -31,7 +31,7 @@ use pageserver_api::models::{ }; use pageserver_api::shard::TenantShardId; use pageserver_api::upcall_api::{ - PutTimelineImportStatusRequest, ReAttachRequest, ValidateRequest, + PutTimelineImportStatusRequest, ReAttachRequest, TimelineImportStatusRequest, ValidateRequest, }; use pageserver_client::{BlockUnblock, mgmt_api}; use routerify::Middleware; @@ -157,6 +157,29 @@ async fn handle_validate(req: Request) -> Result, ApiError> json_response(StatusCode::OK, state.service.validate(validate_req).await?) 
} +async fn handle_get_timeline_import_status(req: Request) -> Result, ApiError> { + check_permissions(&req, Scope::GenerationsApi)?; + + let mut req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let get_req = json_request::(&mut req).await?; + + let state = get_state(&req); + + json_response( + StatusCode::OK, + state + .service + .handle_timeline_shard_import_progress(get_req) + .await?, + ) +} + async fn handle_put_timeline_import_status(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::GenerationsApi)?; @@ -2008,6 +2031,13 @@ pub fn make_router( .post("/upcall/v1/validate", |r| { named_request_span(r, handle_validate, RequestName("upcall_v1_validate")) }) + .get("/upcall/v1/timeline_import_status", |r| { + named_request_span( + r, + handle_get_timeline_import_status, + RequestName("upcall_v1_timeline_import_status"), + ) + }) .post("/upcall/v1/timeline_import_status", |r| { named_request_span( r, diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs index 554ca375f5..817409e112 100644 --- a/storage_controller/src/pageserver_client.rs +++ b/storage_controller/src/pageserver_client.rs @@ -1,3 +1,5 @@ +use std::time::Duration; + use pageserver_api::models::detach_ancestor::AncestorDetached; use pageserver_api::models::{ DetachBehavior, LocationConfig, LocationConfigListResponse, LsnLease, PageserverUtilization, @@ -212,6 +214,7 @@ impl PageserverClient { ) } + #[allow(unused)] pub(crate) async fn timeline_detail( &self, tenant_shard_id: TenantShardId, @@ -357,4 +360,20 @@ impl PageserverClient { self.inner.wait_lsn(tenant_shard_id, request).await ) } + + pub(crate) async fn activate_post_import( + &self, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + timeline_activate_timeout: Duration, + ) -> Result { + measured_request!( + "activate_post_import", + crate::metrics::Method::Put, + 
&self.node_id_label, + self.inner + .activate_post_import(tenant_shard_id, timeline_id, timeline_activate_timeout) + .await + ) + } } diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 9ffcf9b9e6..052c0f02eb 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1666,6 +1666,39 @@ impl Persistence { } } + pub(crate) async fn get_timeline_import( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> DatabaseResult> { + use crate::schema::timeline_imports::dsl; + let persistent_import = self + .with_measured_conn(DatabaseOperation::ListTimelineImports, move |conn| { + Box::pin(async move { + let mut from_db: Vec = dsl::timeline_imports + .filter(dsl::tenant_id.eq(tenant_id.to_string())) + .filter(dsl::timeline_id.eq(timeline_id.to_string())) + .load(conn) + .await?; + + if from_db.len() > 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({})", + from_db.len() + ))); + } + + Ok(from_db.pop()) + }) + }) + .await?; + + persistent_import + .map(TimelineImport::from_persistent) + .transpose() + .map_err(|err| DatabaseError::Logical(format!("failed to deserialize import: {err}"))) + } + pub(crate) async fn delete_timeline_import( &self, tenant_id: TenantId, diff --git a/storage_controller/src/safekeeper_client.rs b/storage_controller/src/safekeeper_client.rs index 988159af4a..1f3ea96d96 100644 --- a/storage_controller/src/safekeeper_client.rs +++ b/storage_controller/src/safekeeper_client.rs @@ -98,6 +98,23 @@ impl SafekeeperClient { ) } + #[allow(unused)] + pub(crate) async fn switch_timeline_membership( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + req: &models::TimelineMembershipSwitchRequest, + ) -> Result { + measured_request!( + "switch_timeline_membership", + crate::metrics::Method::Put, + &self.node_id_label, + self.inner + .switch_timeline_membership(tenant_id, timeline_id, req) + .await + ) + } + pub(crate) async fn 
delete_tenant( &self, tenant_id: TenantId, diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 3d5f36fb98..773373391e 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -628,11 +628,7 @@ impl Scheduler { tracing::trace!(%node_id, "attached_shard_count={} shard_count={} expected={}", node.attached_shard_count, node.shard_count, expected_attached_shards_per_node); } - if node.attached_shard_count < expected_attached_shards_per_node { - expected_attached_shards_per_node - node.attached_shard_count - } else { - 0 - } + expected_attached_shards_per_node.saturating_sub(node.attached_shard_count) } pub(crate) fn expected_attached_shard_count(&self) -> usize { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 21c693af97..7e4bb627af 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -35,19 +35,19 @@ use pageserver_api::controller_api::{ }; use pageserver_api::models::{ self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease, - PageserverUtilization, SecondaryProgress, ShardParameters, TenantConfig, + PageserverUtilization, SecondaryProgress, ShardImportStatus, ShardParameters, TenantConfig, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, TenantShardSplitResponse, TenantSorting, TenantTimeTravelRequest, TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateResponseStorcon, - TimelineInfo, TimelineState, TopTenantShardItem, TopTenantShardsRequest, + TimelineInfo, TopTenantShardItem, TopTenantShardsRequest, }; use pageserver_api::shard::{ DEFAULT_STRIPE_SIZE, ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId, }; use pageserver_api::upcall_api::{ PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, - 
ValidateRequest, ValidateResponse, ValidateResponseTenant, + TimelineImportStatusRequest, ValidateRequest, ValidateResponse, ValidateResponseTenant, }; use pageserver_client::{BlockUnblock, mgmt_api}; use reqwest::{Certificate, StatusCode}; @@ -61,6 +61,7 @@ use utils::completion::Barrier; use utils::generation::Generation; use utils::id::{NodeId, TenantId, TimelineId}; use utils::lsn::Lsn; +use utils::shard::ShardIndex; use utils::sync::gate::{Gate, GateGuard}; use utils::{failpoint_support, pausable_failpoint}; @@ -98,7 +99,8 @@ use crate::tenant_shard::{ ScheduleOptimization, ScheduleOptimizationAction, TenantShard, }; use crate::timeline_import::{ - ShardImportStatuses, TimelineImport, TimelineImportState, UpcallClient, + ImportResult, ShardImportStatuses, TimelineImport, TimelineImportFinalizeError, + TimelineImportState, UpcallClient, }; const WAITER_FILL_DRAIN_POLL_TIMEOUT: Duration = Duration::from_millis(500); @@ -192,6 +194,14 @@ pub(crate) enum LeadershipStatus { Candidate, } +enum ShardGenerationValidity { + Valid, + Mismatched { + claimed: Generation, + actual: Option, + }, +} + pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128; pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256; pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32; @@ -3886,10 +3896,10 @@ impl Service { None } else if safekeepers { - // Note that we do not support creating the timeline on the safekeepers - // for imported timelines. The `start_lsn` of the timeline is not known - // until the import finshes. - // https://github.com/neondatabase/neon/issues/11569 + // Note that for imported timelines, we do not create the timeline on the safekeepers + // straight away. Instead, we do it once the import finalized such that we know what + // start LSN to provide for the safekeepers. This is done in + // [`Self::finalize_timeline_import`]. 
let res = self .tenant_timeline_create_safekeepers(tenant_id, &timeline_info) .instrument(tracing::info_span!("timeline_create_safekeepers", %tenant_id, timeline_id=%timeline_info.timeline_id)) @@ -3905,10 +3915,77 @@ impl Service { }) } + pub(crate) async fn handle_timeline_shard_import_progress( + self: &Arc, + req: TimelineImportStatusRequest, + ) -> Result { + let validity = self + .validate_shard_generation(req.tenant_shard_id, req.generation) + .await?; + match validity { + ShardGenerationValidity::Valid => { + // fallthrough + } + ShardGenerationValidity::Mismatched { claimed, actual } => { + tracing::info!( + claimed=?claimed.into(), + actual=?actual.and_then(|g| g.into()), + "Rejecting import progress fetch from stale generation" + ); + + return Err(ApiError::BadRequest(anyhow::anyhow!("Invalid generation"))); + } + } + + let maybe_import = self + .persistence + .get_timeline_import(req.tenant_shard_id.tenant_id, req.timeline_id) + .await?; + + let import = maybe_import.ok_or_else(|| { + ApiError::NotFound( + format!( + "import for {}/{} not found", + req.tenant_shard_id.tenant_id, req.timeline_id + ) + .into(), + ) + })?; + + import + .shard_statuses + .0 + .get(&req.tenant_shard_id.to_index()) + .cloned() + .ok_or_else(|| { + ApiError::NotFound( + format!("shard {} not found", req.tenant_shard_id.shard_slug()).into(), + ) + }) + } + pub(crate) async fn handle_timeline_shard_import_progress_upcall( self: &Arc, req: PutTimelineImportStatusRequest, ) -> Result<(), ApiError> { + let validity = self + .validate_shard_generation(req.tenant_shard_id, req.generation) + .await?; + match validity { + ShardGenerationValidity::Valid => { + // fallthrough + } + ShardGenerationValidity::Mismatched { claimed, actual } => { + tracing::info!( + claimed=?claimed.into(), + actual=?actual.and_then(|g| g.into()), + "Rejecting import progress update from stale generation" + ); + + return Err(ApiError::PreconditionFailed("Invalid generation".into())); + } + } + let res = self 
.persistence .update_timeline_import(req.tenant_shard_id, req.timeline_id, req.status) @@ -3943,55 +4020,147 @@ impl Service { Ok(()) } + /// Check that a provided generation for some tenant shard is the most recent one. + /// + /// Validate with the in-mem state first, and, if that passes, validate with the + /// database state which is authoritative. + async fn validate_shard_generation( + self: &Arc, + tenant_shard_id: TenantShardId, + generation: Generation, + ) -> Result { + { + let locked = self.inner.read().unwrap(); + let tenant_shard = + locked + .tenants + .get(&tenant_shard_id) + .ok_or(ApiError::InternalServerError(anyhow::anyhow!( + "{} shard not found", + tenant_shard_id + )))?; + + if tenant_shard.generation != Some(generation) { + return Ok(ShardGenerationValidity::Mismatched { + claimed: generation, + actual: tenant_shard.generation, + }); + } + } + + let mut db_generations = self + .persistence + .shard_generations(std::iter::once(&tenant_shard_id)) + .await?; + let (_tid, db_generation) = + db_generations + .pop() + .ok_or(ApiError::InternalServerError(anyhow::anyhow!( + "{} shard not found", + tenant_shard_id + )))?; + + if db_generation != Some(generation) { + return Ok(ShardGenerationValidity::Mismatched { + claimed: generation, + actual: db_generation, + }); + } + + Ok(ShardGenerationValidity::Valid) + } + + /// Finalize the import of a timeline + /// + /// This method should be called once all shards have reported that the import is complete. + /// Firstly, it polls the post import timeline activation endpoint exposed by the pageserver. + /// Once the timeline is active on all shards, the timeline also gets created on the + /// safekeepers. Finally, notify cplane of the import completion (whether failed or + /// successful), and remove the import from the database and in-memory. + /// + /// If this method gets pre-empted by shut down, it will be called again at start-up (on-going + /// imports are stored in the database). 
#[instrument(skip_all, fields( tenant_id=%import.tenant_id, - shard_id=%import.timeline_id, + timeline_id=%import.timeline_id, ))] async fn finalize_timeline_import( self: &Arc, import: TimelineImport, - ) -> anyhow::Result<()> { + ) -> Result<(), TimelineImportFinalizeError> { tracing::info!("Finalizing timeline import"); pausable_failpoint!("timeline-import-pre-cplane-notification"); - let import_failed = import.completion_error().is_some(); + let tenant_id = import.tenant_id; + let timeline_id = import.timeline_id; - if !import_failed { - loop { - if self.cancel.is_cancelled() { - anyhow::bail!("Shut down requested while finalizing import"); - } - - let active = self.timeline_active_on_all_shards(&import).await?; - - match active { - true => { - tracing::info!("Timeline became active on all shards"); - break; - } - false => { - tracing::info!("Timeline not active on all shards yet"); - - tokio::select! { - _ = self.cancel.cancelled() => { - anyhow::bail!("Shut down requested while finalizing import"); - }, - _ = tokio::time::sleep(Duration::from_secs(5)) => {} - }; - } - } + let import_error = import.completion_error(); + match import_error { + Some(err) => { + self.notify_cplane_and_delete_import(tenant_id, timeline_id, Err(err)) + .await?; + tracing::warn!("Timeline import completed with shard errors"); + Ok(()) } - } + None => match self.activate_timeline_post_import(&import).await { + Ok(timeline_info) => { + tracing::info!("Post import timeline activation complete"); + if self.config.timelines_onto_safekeepers { + // Now that we know the start LSN of this timeline, create it on the + // safekeepers. 
+ self.tenant_timeline_create_safekeepers_until_success( + import.tenant_id, + timeline_info, + ) + .await?; + } + + self.notify_cplane_and_delete_import(tenant_id, timeline_id, Ok(())) + .await?; + + tracing::info!("Timeline import completed successfully"); + Ok(()) + } + Err(TimelineImportFinalizeError::ShuttingDown) => { + // We got pre-empted by shut down and will resume after the restart. + Err(TimelineImportFinalizeError::ShuttingDown) + } + Err(err) => { + // Any finalize error apart from shut down is permanent and requires us to notify + // cplane such that it can clean up. + tracing::error!("Import finalize failed with permanent error: {err}"); + self.notify_cplane_and_delete_import( + tenant_id, + timeline_id, + Err(err.to_string()), + ) + .await?; + Err(err) + } + }, + } + } + + async fn notify_cplane_and_delete_import( + self: &Arc, + tenant_id: TenantId, + timeline_id: TimelineId, + import_result: ImportResult, + ) -> Result<(), TimelineImportFinalizeError> { + let import_failed = import_result.is_err(); tracing::info!(%import_failed, "Notifying cplane of import completion"); let client = UpcallClient::new(self.get_config(), self.cancel.child_token()); - client.notify_import_complete(&import).await?; + client + .notify_import_complete(tenant_id, timeline_id, import_result) + .await + .map_err(|_err| TimelineImportFinalizeError::ShuttingDown)?; if let Err(err) = self .persistence - .delete_timeline_import(import.tenant_id, import.timeline_id) + .delete_timeline_import(tenant_id, timeline_id) .await { tracing::warn!("Failed to delete timeline import entry from database: {err}"); @@ -4001,17 +4170,113 @@ impl Service { .write() .unwrap() .tenants - .range_mut(TenantShardId::tenant_range(import.tenant_id)) + .range_mut(TenantShardId::tenant_range(tenant_id)) .for_each(|(_id, shard)| shard.importing = TimelineImportState::Idle); - // TODO(vlad): Timeline creations in import mode do not return a correct initdb lsn, - // so we can't create the timeline on the 
safekeepers. Fix by moving creation here. - // https://github.com/neondatabase/neon/issues/11569 - tracing::info!(%import_failed, "Timeline import complete"); - Ok(()) } + /// Activate an imported timeline on all shards once the import is complete. + /// Returns the [`TimelineInfo`] reported by shard zero. + async fn activate_timeline_post_import( + self: &Arc, + import: &TimelineImport, + ) -> Result { + const TIMELINE_ACTIVATE_TIMEOUT: Duration = Duration::from_millis(128); + + let mut shards_to_activate: HashSet = + import.shard_statuses.0.keys().cloned().collect(); + let mut shard_zero_timeline_info = None; + + while !shards_to_activate.is_empty() { + if self.cancel.is_cancelled() { + return Err(TimelineImportFinalizeError::ShuttingDown); + } + + let targets = { + let locked = self.inner.read().unwrap(); + let mut targets = Vec::new(); + + for (tenant_shard_id, shard) in locked + .tenants + .range(TenantShardId::tenant_range(import.tenant_id)) + { + if !import + .shard_statuses + .0 + .contains_key(&tenant_shard_id.to_index()) + { + return Err(TimelineImportFinalizeError::MismatchedShards( + tenant_shard_id.to_index(), + )); + } + + if let Some(node_id) = shard.intent.get_attached() { + let node = locked + .nodes + .get(node_id) + .expect("Pageservers may not be deleted while referenced"); + targets.push((*tenant_shard_id, node.clone())); + } + } + + targets + }; + + let targeted_tenant_shards: Vec<_> = targets.iter().map(|(tid, _node)| *tid).collect(); + + let results = self + .tenant_for_shards_api( + targets, + |tenant_shard_id, client| async move { + client + .activate_post_import( + tenant_shard_id, + import.timeline_id, + TIMELINE_ACTIVATE_TIMEOUT, + ) + .await + }, + 1, + 1, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + let mut failed = 0; + for (tid, result) in targeted_tenant_shards.iter().zip(results.into_iter()) { + match result { + Ok(ok) => { + if tid.is_shard_zero() { + shard_zero_timeline_info = Some(ok); + } + + 
shards_to_activate.remove(&tid.to_index()); + } + Err(_err) => { + failed += 1; + } + } + } + + if failed > 0 { + tracing::info!( + "Failed to activate timeline on {failed} shards post import. Will retry" + ); + } + + tokio::select! { + _ = tokio::time::sleep(Duration::from_millis(250)) => {}, + _ = self.cancel.cancelled() => { + return Err(TimelineImportFinalizeError::ShuttingDown); + } + } + } + + Ok(shard_zero_timeline_info.expect("All shards replied")) + } + async fn finalize_timeline_imports(self: &Arc, imports: Vec) { futures::future::join_all( imports @@ -4021,61 +4286,6 @@ impl Service { .await; } - async fn timeline_active_on_all_shards( - self: &Arc, - import: &TimelineImport, - ) -> anyhow::Result { - let targets = { - let locked = self.inner.read().unwrap(); - let mut targets = Vec::new(); - - for (tenant_shard_id, shard) in locked - .tenants - .range(TenantShardId::tenant_range(import.tenant_id)) - { - if !import - .shard_statuses - .0 - .contains_key(&tenant_shard_id.to_index()) - { - anyhow::bail!("Shard layout change detected on completion"); - } - - if let Some(node_id) = shard.intent.get_attached() { - let node = locked - .nodes - .get(node_id) - .expect("Pageservers may not be deleted while referenced"); - targets.push((*tenant_shard_id, node.clone())); - } else { - return Ok(false); - } - } - - targets - }; - - let results = self - .tenant_for_shards_api( - targets, - |tenant_shard_id, client| async move { - client - .timeline_detail(tenant_shard_id, import.timeline_id) - .await - }, - 1, - 1, - SHORT_RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; - - Ok(results.into_iter().all(|res| match res { - Ok(info) => info.state == TimelineState::Active, - Err(_) => false, - })) - } - pub(crate) async fn tenant_timeline_archival_config( &self, tenant_id: TenantId, @@ -8485,7 +8695,7 @@ impl Service { // By default, live migrations are generous about the wait time for getting // the secondary location up to speed. 
When draining, give up earlier in order // to not stall the operation when a cold secondary is encountered. - const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) @@ -8818,7 +9028,7 @@ impl Service { node_id: NodeId, cancel: CancellationToken, ) -> Result<(), OperationError> { - const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(20); + const SECONDARY_WARMUP_TIMEOUT: Duration = Duration::from_secs(30); const SECONDARY_DOWNLOAD_REQUEST_TIMEOUT: Duration = Duration::from_secs(5); let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal) .secondary_warmup_timeout(SECONDARY_WARMUP_TIMEOUT) diff --git a/storage_controller/src/service/safekeeper_reconciler.rs b/storage_controller/src/service/safekeeper_reconciler.rs index 71c73a0112..f756d98c64 100644 --- a/storage_controller/src/service/safekeeper_reconciler.rs +++ b/storage_controller/src/service/safekeeper_reconciler.rs @@ -1,4 +1,9 @@ -use std::{collections::HashMap, str::FromStr, sync::Arc, time::Duration}; +use std::{ + collections::HashMap, + str::FromStr, + sync::{Arc, atomic::AtomicU64}, + time::Duration, +}; use clashmap::{ClashMap, Entry}; use safekeeper_api::models::PullTimelineRequest; @@ -169,10 +174,17 @@ pub(crate) struct ScheduleRequest { pub(crate) kind: SafekeeperTimelineOpKind, } +/// A way to keep ongoing/queued reconcile requests apart +#[derive(Copy, Clone, PartialEq, Eq)] +struct TokenId(u64); + +type OngoingTokens = ClashMap<(TenantId, Option), (CancellationToken, TokenId)>; + /// Handle to per safekeeper reconciler. 
struct ReconcilerHandle { - tx: UnboundedSender<(ScheduleRequest, CancellationToken)>, - ongoing_tokens: Arc), CancellationToken>>, + tx: UnboundedSender<(ScheduleRequest, CancellationToken, TokenId)>, + ongoing_tokens: Arc, + token_id_counter: AtomicU64, cancel: CancellationToken, } @@ -185,24 +197,28 @@ impl ReconcilerHandle { &self, tenant_id: TenantId, timeline_id: Option, - ) -> CancellationToken { + ) -> (CancellationToken, TokenId) { + let token_id = self + .token_id_counter + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + let token_id = TokenId(token_id); let entry = self.ongoing_tokens.entry((tenant_id, timeline_id)); if let Entry::Occupied(entry) = &entry { - let cancel: &CancellationToken = entry.get(); + let (cancel, _) = entry.get(); cancel.cancel(); } - entry.insert(self.cancel.child_token()).clone() + entry.insert((self.cancel.child_token(), token_id)).clone() } /// Cancel an ongoing reconciliation fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option) { - if let Some((_, cancel)) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) { + if let Some((_, (cancel, _id))) = self.ongoing_tokens.remove(&(tenant_id, timeline_id)) { cancel.cancel(); } } fn schedule_reconcile(&self, req: ScheduleRequest) { - let cancel = self.new_token_slot(req.tenant_id, req.timeline_id); + let (cancel, token_id) = self.new_token_slot(req.tenant_id, req.timeline_id); let hostname = req.safekeeper.skp.host.clone(); - if let Err(err) = self.tx.send((req, cancel)) { + if let Err(err) = self.tx.send((req, cancel, token_id)) { tracing::info!("scheduling request onto {hostname} returned error: {err}"); } } @@ -211,13 +227,14 @@ impl ReconcilerHandle { pub(crate) struct SafekeeperReconciler { inner: SafekeeperReconcilerInner, concurrency_limiter: Arc, - rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>, + rx: UnboundedReceiver<(ScheduleRequest, CancellationToken, TokenId)>, cancel: CancellationToken, } /// Thin wrapper over `Service` to not 
clutter its inherent functions #[derive(Clone)] struct SafekeeperReconcilerInner { + ongoing_tokens: Arc, service: Arc, } @@ -226,15 +243,20 @@ impl SafekeeperReconciler { // We hold the ServiceInner lock so we don't want to make sending to the reconciler channel to be blocking. let (tx, rx) = mpsc::unbounded_channel(); let concurrency = service.config.safekeeper_reconciler_concurrency; + let ongoing_tokens = Arc::new(ClashMap::new()); let mut reconciler = SafekeeperReconciler { - inner: SafekeeperReconcilerInner { service }, + inner: SafekeeperReconcilerInner { + service, + ongoing_tokens: ongoing_tokens.clone(), + }, rx, concurrency_limiter: Arc::new(Semaphore::new(concurrency)), cancel: cancel.clone(), }; let handle = ReconcilerHandle { tx, - ongoing_tokens: Arc::new(ClashMap::new()), + ongoing_tokens, + token_id_counter: AtomicU64::new(0), cancel, }; tokio::spawn(async move { reconciler.run().await }); @@ -246,7 +268,9 @@ impl SafekeeperReconciler { req = self.rx.recv() => req, _ = self.cancel.cancelled() => break, }; - let Some((req, req_cancel)) = req else { break }; + let Some((req, req_cancel, req_token_id)) = req else { + break; + }; let permit_res = tokio::select! 
{ req = self.concurrency_limiter.clone().acquire_owned() => req, @@ -265,7 +289,7 @@ impl SafekeeperReconciler { let timeline_id = req.timeline_id; let node_id = req.safekeeper.skp.id; inner - .reconcile_one(req, req_cancel) + .reconcile_one(req, req_cancel, req_token_id) .instrument(tracing::info_span!( "reconcile_one", ?kind, @@ -280,8 +304,14 @@ impl SafekeeperReconciler { } impl SafekeeperReconcilerInner { - async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) { + async fn reconcile_one( + &self, + req: ScheduleRequest, + req_cancel: CancellationToken, + req_token_id: TokenId, + ) { let req_host = req.safekeeper.skp.host.clone(); + let success; match req.kind { SafekeeperTimelineOpKind::Pull => { let Some(timeline_id) = req.timeline_id else { @@ -301,20 +331,24 @@ impl SafekeeperReconcilerInner { http_hosts, tenant_id: req.tenant_id, timeline_id, + ignore_tombstone: Some(false), }; - self.reconcile_inner( - req, - async |client| client.pull_timeline(&pull_req).await, - |resp| { - if let Some(host) = resp.safekeeper_host { - tracing::info!("pulled timeline from {host} onto {req_host}"); - } else { - tracing::info!("timeline already present on safekeeper on {req_host}"); - } - }, - req_cancel, - ) - .await; + success = self + .reconcile_inner( + &req, + async |client| client.pull_timeline(&pull_req).await, + |resp| { + if let Some(host) = resp.safekeeper_host { + tracing::info!("pulled timeline from {host} onto {req_host}"); + } else { + tracing::info!( + "timeline already present on safekeeper on {req_host}" + ); + } + }, + req_cancel, + ) + .await; } SafekeeperTimelineOpKind::Exclude => { // TODO actually exclude instead of delete here @@ -325,22 +359,23 @@ impl SafekeeperReconcilerInner { ); return; }; - self.reconcile_inner( - req, - async |client| client.delete_timeline(tenant_id, timeline_id).await, - |_resp| { - tracing::info!("deleted timeline from {req_host}"); - }, - req_cancel, - ) - .await; + success = self + 
.reconcile_inner( + &req, + async |client| client.delete_timeline(tenant_id, timeline_id).await, + |_resp| { + tracing::info!("deleted timeline from {req_host}"); + }, + req_cancel, + ) + .await; } SafekeeperTimelineOpKind::Delete => { let tenant_id = req.tenant_id; if let Some(timeline_id) = req.timeline_id { - let deleted = self + success = self .reconcile_inner( - req, + &req, async |client| client.delete_timeline(tenant_id, timeline_id).await, |_resp| { tracing::info!("deleted timeline from {req_host}"); @@ -348,13 +383,13 @@ impl SafekeeperReconcilerInner { req_cancel, ) .await; - if deleted { + if success { self.delete_timeline_from_db(tenant_id, timeline_id).await; } } else { - let deleted = self + success = self .reconcile_inner( - req, + &req, async |client| client.delete_tenant(tenant_id).await, |_resp| { tracing::info!(%tenant_id, "deleted tenant from {req_host}"); @@ -362,12 +397,21 @@ impl SafekeeperReconcilerInner { req_cancel, ) .await; - if deleted { + if success { self.delete_tenant_timelines_from_db(tenant_id).await; } } } } + if success { + self.ongoing_tokens.remove_if( + &(req.tenant_id, req.timeline_id), + |_ttid, (_cancel, token_id)| { + // Ensure that this request is indeed the request we just finished and not a new one + req_token_id == *token_id + }, + ); + } } async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) { match self @@ -421,10 +465,10 @@ impl SafekeeperReconcilerInner { self.delete_timeline_from_db(tenant_id, timeline_id).await; } } - /// Returns whether the reconciliation happened successfully + /// Returns whether the reconciliation happened successfully (or we got cancelled) async fn reconcile_inner( &self, - req: ScheduleRequest, + req: &ScheduleRequest, closure: impl Fn(SafekeeperClient) -> F, log_success: impl FnOnce(T) -> U, req_cancel: CancellationToken, diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs index 
5eecf0d415..cd5ace449d 100644 --- a/storage_controller/src/service/safekeeper_service.rs +++ b/storage_controller/src/service/safekeeper_service.rs @@ -10,6 +10,7 @@ use crate::persistence::{ DatabaseError, SafekeeperTimelineOpKind, TimelinePendingOpPersistence, TimelinePersistence, }; use crate::safekeeper::Safekeeper; +use crate::timeline_import::TimelineImportFinalizeError; use anyhow::Context; use http_utils::error::ApiError; use pageserver_api::controller_api::{ @@ -323,6 +324,42 @@ impl Service { }) } + pub(crate) async fn tenant_timeline_create_safekeepers_until_success( + self: &Arc, + tenant_id: TenantId, + timeline_info: TimelineInfo, + ) -> Result<(), TimelineImportFinalizeError> { + const BACKOFF: Duration = Duration::from_secs(5); + + loop { + if self.cancel.is_cancelled() { + return Err(TimelineImportFinalizeError::ShuttingDown); + } + + let res = self + .tenant_timeline_create_safekeepers(tenant_id, &timeline_info) + .await; + + match res { + Ok(_) => { + tracing::info!("Timeline created on safekeepers"); + break; + } + Err(err) => { + tracing::error!("Failed to create timeline on safekeepers: {err}"); + tokio::select! { + _ = self.cancel.cancelled() => { + return Err(TimelineImportFinalizeError::ShuttingDown); + }, + _ = tokio::time::sleep(BACKOFF) => {} + }; + } + } + } + + Ok(()) + } + /// Directly insert the timeline into the database without reconciling it with safekeepers. 
/// /// Useful if the timeline already exists on the specified safekeepers, diff --git a/storage_controller/src/timeline_import.rs b/storage_controller/src/timeline_import.rs index 6dcc538c4b..909e8e2899 100644 --- a/storage_controller/src/timeline_import.rs +++ b/storage_controller/src/timeline_import.rs @@ -5,7 +5,7 @@ use http_utils::error::ApiError; use reqwest::Method; use serde::{Deserialize, Serialize}; -use pageserver_api::models::ShardImportStatus; +use pageserver_api::models::{ShardImportProgress, ShardImportStatus}; use tokio_util::sync::CancellationToken; use utils::{ id::{TenantId, TimelineId}, @@ -28,7 +28,12 @@ impl ShardImportStatuses { ShardImportStatuses( shards .into_iter() - .map(|ts_id| (ts_id, ShardImportStatus::InProgress)) + .map(|ts_id| { + ( + ts_id, + ShardImportStatus::InProgress(None::), + ) + }) .collect(), ) } @@ -46,6 +51,14 @@ pub(crate) enum TimelineImportUpdateFollowUp { None, } +#[derive(thiserror::Error, Debug)] +pub(crate) enum TimelineImportFinalizeError { + #[error("Shut down interrupted import finalize")] + ShuttingDown, + #[error("Mismatched shard detected during import finalize: {0}")] + MismatchedShards(ShardIndex), +} + pub(crate) enum TimelineImportUpdateError { ImportNotFound { tenant_id: TenantId, @@ -151,6 +164,8 @@ impl TimelineImport { } } +pub(crate) type ImportResult = Result<(), String>; + pub(crate) struct UpcallClient { authorization_header: Option, client: reqwest::Client, @@ -198,7 +213,9 @@ impl UpcallClient { /// eventual cplane availability. The cplane API is idempotent. 
pub(crate) async fn notify_import_complete( &self, - import: &TimelineImport, + tenant_id: TenantId, + timeline_id: TimelineId, + import_result: ImportResult, ) -> anyhow::Result<()> { let endpoint = if self.base_url.ends_with('/') { format!("{}import_complete", self.base_url) @@ -206,15 +223,13 @@ impl UpcallClient { format!("{}/import_complete", self.base_url) }; - tracing::info!("Endpoint is {endpoint}"); - let request = self .client .request(Method::PUT, endpoint) .json(&ImportCompleteRequest { - tenant_id: import.tenant_id, - timeline_id: import.timeline_id, - error: import.completion_error(), + tenant_id, + timeline_id, + error: import_result.err(), }) .timeout(IMPORT_COMPLETE_REQUEST_TIMEOUT); diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index b151b612bf..865f0908f9 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -13,7 +13,7 @@ use pageserver::tenant::remote_timeline_client::{ }; use pageserver::tenant::storage_layer::LayerName; use pageserver_api::shard::ShardIndex; -use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; +use remote_storage::{DownloadError, GenericRemoteStorage, ListingObject, RemotePath}; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; @@ -165,23 +165,34 @@ pub(crate) async fn branch_cleanup_and_check_errors( .head_object(&path, &CancellationToken::new()) .await; - if let Err(e) = response { - // Object is not present. - let is_l0 = LayerMap::is_l0(layer.key_range(), layer.is_delta()); + match response { + Ok(_) => {} + Err(DownloadError::NotFound) => { + // Object is not present. 
+ let is_l0 = + LayerMap::is_l0(layer.key_range(), layer.is_delta()); - let msg = format!( - "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {}) with error: {}", - layer, - metadata.generation.get_suffix(), - metadata.shard, - is_l0, - e, - ); + let msg = format!( + "index_part.json contains a layer {}{} (shard {}) that is not present in remote storage (layer_is_l0: {})", + layer, + metadata.generation.get_suffix(), + metadata.shard, + is_l0, + ); - if is_l0 || ignore_error { - result.warnings.push(msg); - } else { - result.errors.push(msg); + if is_l0 || ignore_error { + result.warnings.push(msg); + } else { + result.errors.push(msg); + } + } + Err(e) => { + tracing::warn!( + "cannot check if the layer {}{} is present in remote storage (error: {})", + layer, + metadata.generation.get_suffix(), + e, + ); } } } @@ -355,6 +366,7 @@ pub(crate) async fn list_timeline_blobs( match res { ListTimelineBlobsResult::Ready(data) => Ok(data), ListTimelineBlobsResult::MissingIndexPart(_) => { + tracing::warn!("listing raced with removal of an index, retrying"); // Retry if listing raced with removal of an index let data = list_timeline_blobs_impl(remote_client, id, root_target) .await? 
@@ -441,7 +453,7 @@ async fn list_timeline_blobs_impl( } if index_part_keys.is_empty() && s3_layers.is_empty() { - tracing::debug!("Timeline is empty: expected post-deletion state."); + tracing::info!("Timeline is empty: expected post-deletion state."); if initdb_archive { tracing::info!("Timeline is post deletion but initdb archive is still present."); } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index e1a4095a3c..49ab192285 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -593,6 +593,7 @@ async fn gc_timeline( index_part_snapshot_time: _, } => (index_part, *index_part_generation, data.unused_index_keys), BlobDataParseResult::Relic => { + tracing::info!("Skipping timeline {ttid}, it is a relic"); // Post-deletion tenant location: don't try and GC it. return Ok(summary); } diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 24231e32fc..d0ca53f8ab 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -24,7 +24,6 @@ pub struct SnapshotDownloader { remote_client: GenericRemoteStorage, #[allow(dead_code)] target: RootTarget, - bucket_config: BucketConfig, tenant_id: TenantId, output_path: Utf8PathBuf, concurrency: usize, @@ -43,7 +42,6 @@ impl SnapshotDownloader { Ok(Self { remote_client, target, - bucket_config, tenant_id, output_path, concurrency, @@ -218,11 +216,9 @@ impl SnapshotDownloader { } pub async fn download(&self) -> anyhow::Result<()> { - let (remote_client, target) = - init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; - // Generate a stream of TenantShardId - let shards = stream_tenant_shards(&remote_client, &target, self.tenant_id).await?; + let shards = + stream_tenant_shards(&self.remote_client, &self.target, self.tenant_id).await?; let shards: Vec = shards.try_collect().await?; // Only read from shards 
that have the highest count: avoids redundantly downloading @@ -240,7 +236,8 @@ impl SnapshotDownloader { for shard in shards.into_iter().filter(|s| s.shard_count == shard_count) { // Generate a stream of TenantTimelineId - let timelines = stream_tenant_timelines(&remote_client, &target, shard).await?; + let timelines = + stream_tenant_timelines(&self.remote_client, &self.target, shard).await?; // Generate a stream of S3TimelineBlobData async fn load_timeline_index( @@ -251,8 +248,8 @@ impl SnapshotDownloader { let data = list_timeline_blobs(remote_client, ttid, target).await?; Ok((ttid, data)) } - let timelines = - timelines.map_ok(|ttid| load_timeline_index(&remote_client, &target, ttid)); + let timelines = timelines + .map_ok(|ttid| load_timeline_index(&self.remote_client, &self.target, ttid)); let mut timelines = std::pin::pin!(timelines.try_buffered(8)); while let Some(i) = timelines.next().await { diff --git a/test_runner/bin/neon_local_create_deep_l0_stack.py b/test_runner/bin/neon_local_create_deep_l0_stack.py new file mode 100644 index 0000000000..ebe11f7308 --- /dev/null +++ b/test_runner/bin/neon_local_create_deep_l0_stack.py @@ -0,0 +1,59 @@ +""" +Script to creates a stack of L0 deltas each of which should have 1 Value::Delta per page in `data`, +in your running neon_local setup. + +Use this bash setup to reset your neon_local environment. +The last line of this bash snippet will run this file here. 
+``` + export NEON_REPO_DIR=$PWD/.neon + export NEON_BIN_DIR=$PWD/target/release + $NEON_BIN_DIR/neon_local stop + rm -rf $NEON_REPO_DIR + $NEON_BIN_DIR/neon_local init + cat >> $NEON_REPO_DIR/pageserver_1/pageserver.toml <<"EOF" + # customizations + virtual_file_io_mode = "direct-rw" + page_service_pipelining={mode="pipelined", max_batch_size=32, execution="concurrent-futures"} + get_vectored_concurrent_io={mode="sidecar-task"} +EOF + $NEON_BIN_DIR/neon_local start + + psql 'postgresql://localhost:1235/storage_controller' -c 'DELETE FROM tenant_shards' + sed 's/.*get_vectored_concurrent_io.*/get_vectored_concurrent_io={mode="sidecar-task"}/' -i $NEON_REPO_DIR/pageserver_1/pageserver.toml + $NEON_BIN_DIR/neon_local pageserver restart + sleep 2 + $NEON_BIN_DIR/neon_local tenant create --set-default + ./target/debug/neon_local endpoint stop foo + rm -rf $NEON_REPO_DIR/endpoints/foo + ./target/debug/neon_local endpoint create foo + echo 'full_page_writes=off' >> $NEON_REPO_DIR/endpoints/foo/postgresql.conf + ./target/debug/neon_local endpoint start foo + + pushd test_runner; poetry run python3 -m bin.neon_local_create_deep_l0_stack 10; popd +``` +""" + +import sys + +import psycopg2 +from fixtures.common_types import TenantShardId, TimelineId +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.makelayers.l0stack import L0StackShape, make_l0_stack_standalone + +ps_http = PageserverHttpClient(port=9898, is_testing_enabled_or_skip=lambda: None) +vps_http = PageserverHttpClient(port=1234, is_testing_enabled_or_skip=lambda: None) + +tenants = ps_http.tenant_list() +assert len(tenants) == 1 +tenant_shard_id = TenantShardId.parse(tenants[0]["id"]) + +timlines = ps_http.timeline_list(tenant_shard_id) +assert len(timlines) == 1 +timeline_id = TimelineId(timlines[0]["timeline_id"]) + +connstr = "postgresql://cloud_admin@localhost:55432/postgres" +conn = psycopg2.connect(connstr) + +shape = L0StackShape(logical_table_size_mib=50, 
delta_stack_height=int(sys.argv[1])) + +make_l0_stack_standalone(vps_http, ps_http, tenant_shard_id, timeline_id, conn, shape) diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 3be78719d7..bb07e2b6d1 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -103,7 +103,7 @@ class AbstractNeonCli: else: stdout = "" - log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}") + log.warning(f"CLI timeout: stderr={stderr}, stdout={stdout}") raise indent = " " @@ -557,7 +557,7 @@ class NeonLocalCli(AbstractNeonCli): endpoint_id: str, safekeepers_generation: int | None = None, safekeepers: list[int] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, create_test_user: bool = False, @@ -572,8 +572,8 @@ class NeonLocalCli(AbstractNeonCli): extra_env_vars = env or {} if basebackup_request_tries is not None: extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) - if remote_ext_config is not None: - args.extend(["--remote-ext-config", remote_ext_config]) + if remote_ext_base_url is not None: + args.extend(["--remote-ext-base-url", remote_ext_base_url]) if safekeepers_generation is not None: args.extend(["--safekeepers-generation", str(safekeepers_generation)]) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d4a750ad3b..e413b3c6d2 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1255,6 +1255,12 @@ class NeonEnv: "no_sync": True, # Look for gaps in WAL received from safekeepeers "validate_wal_contiguity": True, + # TODO(vlad): make these configurable through the builder + "timeline_import_config": { + "import_job_concurrency": 4, + "import_job_soft_size_limit": 512 * 1024, + "import_job_checkpoint_threshold": 4, + }, } # Batching 
(https://github.com/neondatabase/neon/issues/9377): @@ -1274,6 +1280,8 @@ class NeonEnv: if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine + if self.pageserver_virtual_file_io_mode is not None: + ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode if config.pageserver_default_tenant_config_compaction_algorithm is not None: tenant_config = ps_cfg.setdefault("tenant_config", {}) tenant_config["compaction_algorithm"] = ( @@ -1299,13 +1307,6 @@ class NeonEnv: for key, value in override.items(): ps_cfg[key] = value - if self.pageserver_virtual_file_io_mode is not None: - # TODO(christian): https://github.com/neondatabase/neon/issues/11598 - if not config.test_may_use_compatibility_snapshot_binaries: - ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode - else: - log.info("ignoring virtual_file_io_mode parametrization for compatibility test") - if self.pageserver_wal_receiver_protocol is not None: key, value = PageserverWalReceiverProtocol.to_config_key_value( self.pageserver_wal_receiver_protocol @@ -1376,7 +1377,11 @@ class NeonEnv: force=config.config_init_force, ) - def start(self, timeout_in_seconds: int | None = None): + def start( + self, + timeout_in_seconds: int | None = None, + extra_ps_env_vars: dict[str, str] | None = None, + ): # Storage controller starts first, so that pageserver /re-attach calls don't # bounce through retries on startup self.storage_controller.start(timeout_in_seconds=timeout_in_seconds) @@ -1395,7 +1400,10 @@ class NeonEnv: for pageserver in self.pageservers: futs.append( executor.submit( - lambda ps=pageserver: ps.start(timeout_in_seconds=timeout_in_seconds) # type: ignore[misc] + lambda ps=pageserver: ps.start( # type: ignore[misc] + extra_env_vars=extra_ps_env_vars or {}, + timeout_in_seconds=timeout_in_seconds, + ), ) ) @@ -1409,30 +1417,6 @@ class NeonEnv: for f in futs: f.result() - # Last step: register safekeepers at 
the storage controller - if ( - self.storage_controller_config is not None - and self.storage_controller_config.get("timelines_onto_safekeepers") is True - ): - for sk_id, sk in enumerate(self.safekeepers): - # 0 is an invalid safekeeper id - sk_id = sk_id + 1 - body = { - "id": sk_id, - "created_at": "2023-10-25T09:11:25Z", - "updated_at": "2024-08-28T11:32:43Z", - "region_id": "aws-us-east-2", - "host": "127.0.0.1", - "port": sk.port.pg, - "http_port": sk.port.http, - "https_port": None, - "version": 5957, - "availability_zone_id": f"us-east-2b-{sk_id}", - } - - self.storage_controller.on_safekeeper_deploy(sk_id, body) - self.storage_controller.safekeeper_scheduling_policy(sk_id, "Active") - self.endpoint_storage.start(timeout_in_seconds=timeout_in_seconds) def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): @@ -3636,6 +3620,8 @@ class NeonProxy(PgProtocol): http_port: int, mgmt_port: int, external_http_port: int, + router_port: int, + router_tls_port: int, auth_backend: NeonProxy.AuthBackend, metric_collection_endpoint: str | None = None, metric_collection_interval: str | None = None, @@ -3652,6 +3638,8 @@ class NeonProxy(PgProtocol): self.test_output_dir = test_output_dir self.proxy_port = proxy_port self.mgmt_port = mgmt_port + self.router_port = router_port + self.router_tls_port = router_tls_port self.auth_backend = auth_backend self.metric_collection_endpoint = metric_collection_endpoint self.metric_collection_interval = metric_collection_interval @@ -3666,6 +3654,14 @@ class NeonProxy(PgProtocol): key_path = self.test_output_dir / "proxy.key" generate_proxy_tls_certs("*.local.neon.build", key_path, crt_path) + # generate key for pg-sni-router. 
+ # endpoint.namespace.local.neon.build resolves to 127.0.0.1 + generate_proxy_tls_certs( + "endpoint.namespace.local.neon.build", + self.test_output_dir / "router.key", + self.test_output_dir / "router.crt", + ) + args = [ str(self.neon_binpath / "proxy"), *["--http", f"{self.host}:{self.http_port}"], @@ -3675,6 +3671,11 @@ class NeonProxy(PgProtocol): *["--sql-over-http-timeout", f"{self.http_timeout_seconds}s"], *["-c", str(crt_path)], *["-k", str(key_path)], + *["--sni-router-listen", f"{self.host}:{self.router_port}"], + *["--sni-router-listen-tls", f"{self.host}:{self.router_tls_port}"], + *["--sni-router-tls-cert", str(self.test_output_dir / "router.crt")], + *["--sni-router-tls-key", str(self.test_output_dir / "router.key")], + *["--sni-router-destination", "local.neon.build"], *self.auth_backend.extra_args(), ] @@ -3866,7 +3867,7 @@ class NeonAuthBroker: external_http_port: int, auth_backend: NeonAuthBroker.ProxyV1, ): - self.domain = "apiauth.local.neon.build" # resolves to 127.0.0.1 + self.domain = "local.neon.build" # resolves to 127.0.0.1 self.host = "127.0.0.1" self.http_port = http_port self.external_http_port = external_http_port @@ -3883,7 +3884,7 @@ class NeonAuthBroker: # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - generate_proxy_tls_certs("apiauth.local.neon.build", key_path, crt_path) + generate_proxy_tls_certs(f"apiauth.{self.domain}", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3927,10 +3928,10 @@ class NeonAuthBroker: log.info(f"Executing http query: {query}") - connstr = f"postgresql://{user}@{self.domain}/postgres" + connstr = f"postgresql://{user}@ep-foo-bar-1234.{self.domain}/postgres" async with httpx.AsyncClient(verify=str(self.test_output_dir / "proxy.crt")) as client: response = await client.post( - f"https://{self.domain}:{self.external_http_port}/sql", + f"https://apiauth.{self.domain}:{self.external_http_port}/sql", 
json={"query": query, "params": args}, headers={ "Neon-Connection-String": connstr, @@ -3974,6 +3975,8 @@ def link_proxy( proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() external_http_port = port_distributor.get_port() + router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -3981,6 +3984,8 @@ def link_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + router_port=router_port, + router_tls_port=router_tls_port, external_http_port=external_http_port, auth_backend=NeonProxy.Link(), ) as proxy: @@ -4014,6 +4019,8 @@ def static_proxy( mgmt_port = port_distributor.get_port() http_port = port_distributor.get_port() external_http_port = port_distributor.get_port() + router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -4021,6 +4028,8 @@ def static_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + router_port=router_port, + router_tls_port=router_tls_port, external_http_port=external_http_port, auth_backend=NeonProxy.Postgres(auth_endpoint), ) as proxy: @@ -4226,7 +4235,7 @@ class Endpoint(PgProtocol, LogUtils): def start( self, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, safekeeper_generation: int | None = None, safekeepers: list[int] | None = None, @@ -4252,7 +4261,7 @@ class Endpoint(PgProtocol, LogUtils): self.endpoint_id, safekeepers_generation=safekeeper_generation, safekeepers=self.active_safekeepers, - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, allow_multiple=allow_multiple, create_test_user=create_test_user, @@ -4467,7 +4476,7 @@ class Endpoint(PgProtocol, LogUtils): hot_standby: bool = False, lsn: Lsn | None = None, config_lines: list[str] | None = None, - remote_ext_config: str | None = 
None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, allow_multiple: bool = False, basebackup_request_tries: int | None = None, @@ -4486,7 +4495,7 @@ class Endpoint(PgProtocol, LogUtils): pageserver_id=pageserver_id, allow_multiple=allow_multiple, ).start( - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, allow_multiple=allow_multiple, basebackup_request_tries=basebackup_request_tries, @@ -4570,7 +4579,7 @@ class EndpointFactory: lsn: Lsn | None = None, hot_standby: bool = False, config_lines: list[str] | None = None, - remote_ext_config: str | None = None, + remote_ext_base_url: str | None = None, pageserver_id: int | None = None, basebackup_request_tries: int | None = None, ) -> Endpoint: @@ -4590,7 +4599,7 @@ class EndpointFactory: hot_standby=hot_standby, config_lines=config_lines, lsn=lsn, - remote_ext_config=remote_ext_config, + remote_ext_base_url=remote_ext_base_url, pageserver_id=pageserver_id, basebackup_request_tries=basebackup_request_tries, ) @@ -4644,7 +4653,10 @@ class EndpointFactory: return self def new_replica( - self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None + self, + origin: Endpoint, + endpoint_id: str | None = None, + config_lines: list[str] | None = None, ): branch_name = origin.branch_name assert origin in self.endpoints @@ -4660,7 +4672,10 @@ class EndpointFactory: ) def new_replica_start( - self, origin: Endpoint, endpoint_id: str, config_lines: list[str] | None = None + self, + origin: Endpoint, + endpoint_id: str | None = None, + config_lines: list[str] | None = None, ): branch_name = origin.branch_name assert origin in self.endpoints @@ -5477,6 +5492,13 @@ def wait_for_last_flush_lsn( if last_flush_lsn is None: last_flush_lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + # The last_flush_lsn may not correspond to a record boundary. 
+ # For example, if the compute flushed WAL on a page boundary, + # the remaining part of the record might not be flushed for a long time. + # This would prevent the pageserver from reaching last_flush_lsn promptly. + # To ensure the rest of the record reaches the pageserver quickly, + # we forcibly flush the WAL by using CHECKPOINT. + endpoint.safe_psql("CHECKPOINT") results = [] for tenant_shard_id, pageserver in shards: diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index 24c856e279..9b564f0a60 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -111,6 +111,13 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*stalling layer flushes for compaction backpressure.*", ".*layer roll waiting for flush due to compaction backpressure.*", ".*BatchSpanProcessor.*", + *( + [ + r".*your platform is not a supported production platform, ignoing request for O_DIRECT; this could hide alignment bugs.*" + ] + if sys.platform != "linux" + else [] + ), ) @@ -122,6 +129,10 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [ ".*Call to node.*management API.*failed.*Timeout.*", ".*Failed to update node .+ after heartbeat round.*error sending request for url.*", ".*background_reconcile: failed to fetch top tenants:.*client error \\(Connect\\).*", + # Many tests will take safekeepers offline + ".*Call to safekeeper.*management API.*failed.*receive body.*", + ".*Call to safekeeper.*management API.*failed.*ReceiveBody.*", + ".*Call to safekeeper.*management API.*failed.*Timeout.*", # Many tests will start up with a node offline ".*startup_reconcile: Could not scan node.*", # Tests run in dev mode diff --git a/test_runner/fixtures/pageserver/makelayers/__init__.py b/test_runner/fixtures/pageserver/makelayers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/fixtures/pageserver/makelayers/l0stack.py 
b/test_runner/fixtures/pageserver/makelayers/l0stack.py new file mode 100644 index 0000000000..408ba1254f --- /dev/null +++ b/test_runner/fixtures/pageserver/makelayers/l0stack.py @@ -0,0 +1,148 @@ +from dataclasses import dataclass + +from psycopg2.extensions import connection as PgConnection + +from fixtures.common_types import Lsn, TenantShardId, TimelineId +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint +from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.utils import wait_for_last_record_lsn + + +@dataclass +class L0StackShape: + logical_table_size_mib: int = 50 + delta_stack_height: int = 20 + + +def make_l0_stack(endpoint: Endpoint, shape: L0StackShape): + """ + Creates stack of L0 deltas each of which should have 1 Value::Delta per page in table `data`. + """ + env = endpoint.env + + # TDOO: wait for storcon to finish any reonciles before jumping to action here? + description = env.storage_controller.tenant_describe(endpoint.tenant_id) + shards = description["shards"] + assert len(shards) == 1, "does not support sharding" + tenant_shard_id = TenantShardId.parse(shards[0]["tenant_shard_id"]) + + endpoint.config(["full_page_writes=off"]) + endpoint.reconfigure() + + ps = env.get_pageserver(shards[0]["node_attached"]) + + timeline_id = endpoint.show_timeline_id() + + vps_http = env.storage_controller.pageserver_api() + ps_http = ps.http_client() + endpoint_conn = endpoint.connect() + make_l0_stack_standalone(vps_http, ps_http, tenant_shard_id, timeline_id, endpoint_conn, shape) + + +def make_l0_stack_standalone( + vps_http: PageserverHttpClient, + ps_http: PageserverHttpClient, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + endpoint_conn: PgConnection, + shape: L0StackShape, +): + """ + See make_l0_stack for details. + + This function is a standalone version of make_l0_stack, usable from not-test code. 
+ """ + + assert not tenant_shard_id.shard_index.is_sharded, ( + "the current implementation only supports unsharded tenants" + ) + + tenant_id = tenant_shard_id.tenant_id + conn = endpoint_conn + desired_size = shape.logical_table_size_mib * 1024 * 1024 + + config = { + "gc_period": "0s", # disable periodic gc + "checkpoint_timeout": "10 years", + "compaction_period": "1h", # doesn't matter, but 0 value will kill walredo every 10s + "compaction_threshold": 100000, # we just want L0s + "compaction_target_size": 134217728, + "checkpoint_distance": 268435456, + "image_creation_threshold": 100000, # we just want L0s + } + + vps_http.set_tenant_config(tenant_id, config) + + conn.autocommit = True + cur = conn.cursor() + + # Ensure full_page_writes are disabled so that all Value::Delta in + # pageserver are !will_init, and therefore a getpage needs to read + # the entire delta stack. + cur.execute("SHOW full_page_writes") + assert cur.fetchall()[0][0] == "off", "full_page_writes should be off" + + # each tuple is 23 (header) + 100 bytes = 123 bytes + # page header si 24 bytes + # 8k page size + # (8k-24bytes) / 123 bytes = 63 tuples per page + # set fillfactor to 10 to have 6 tuples per page + cur.execute("DROP TABLE IF EXISTS data") + cur.execute("CREATE TABLE data(id bigint, row char(92)) with (fillfactor=10)") + need_pages = desired_size // 8192 + need_rows = need_pages * 6 + log.info(f"Need {need_pages} pages, {need_rows} rows") + cur.execute(f"INSERT INTO data SELECT i,'row'||i FROM generate_series(1, {need_rows}) as i") + # Raise fillfactor to 100% so that all updates are HOT updates. + # We assert they're hot updates by checking fetch_id_to_page_mapping remains the same. 
+ cur.execute("ALTER TABLE data SET (fillfactor=100)") + + def settle_and_flush(): + cur.execute("SELECT pg_current_wal_flush_lsn()") + flush_lsn = Lsn(cur.fetchall()[0][0]) + wait_for_last_record_lsn(ps_http, tenant_shard_id, timeline_id, flush_lsn) + ps_http.timeline_checkpoint(tenant_id, timeline_id) + + # create an L0 for the initial data we just inserted + settle_and_flush() + + # assert we wrote what we think we wrote + cur.execute(""" + with ntuples_per_page as ( + select (ctid::text::point)[0]::bigint pageno,count(*) ntuples from data group by pageno + ) + select ntuples, count(*) npages from ntuples_per_page group by ntuples order by ntuples; + """) + rows = cur.fetchall() + log.info(f"initial table layout: {rows}") + assert len(rows) == 1 + assert rows[0][0] == 6, f"expected 6 tuples per page, got {rows[0][0]}" + assert rows[0][1] == need_pages, f"expected {need_pages} pages, got {rows[0][1]}" + + def fetch_id_to_page_mapping(): + cur.execute(""" + SELECT id,(ctid::text::point)[0]::bigint pageno FROM data ORDER BY id + """) + return cur.fetchall() + + initial_mapping = fetch_id_to_page_mapping() + + # every iteration updates one tuple in each page + delta_stack_height = shape.delta_stack_height + for i in range(0, delta_stack_height): + log.info(i) + cur.execute(f"UPDATE data set row = row||',u' where id % 6 = {i % 6}") + log.info(f"modified rows: {cur.rowcount}") + assert cur.rowcount == need_pages + settle_and_flush() + post_update_mapping = fetch_id_to_page_mapping() + assert initial_mapping == post_update_mapping, "Postgres should be doing HOT updates" + + # Assert the layer count is what we expect it is + layer_map = vps_http.layer_map_info(tenant_id, timeline_id) + assert ( + len(layer_map.delta_l0_layers()) == delta_stack_height + 1 + 1 + ) # +1 for the initdb layer + 1 for the table creation & fill + assert len(layer_map.delta_l0_layers()) == len(layer_map.delta_layers()) # it's all L0s + assert len(layer_map.image_layers()) == 0 # no images diff 
--git a/test_runner/performance/README.md b/test_runner/performance/README.md index 3b25a60e9b..21844648d1 100644 --- a/test_runner/performance/README.md +++ b/test_runner/performance/README.md @@ -15,7 +15,8 @@ Some handy pytest flags for local development: - `-k` selects a test to run - `--timeout=0` disables our default timeout of 300s (see `setup.cfg`) - `--preserve-database-files` to skip cleanup -- `--out-dir` to produce a JSON with the recorded test metrics +- `--out-dir` to produce a JSON with the recorded test metrics. + There is a post-processing tool at `test_runner/performance/out_dir_to_csv.py`. # What performance tests do we have and how we run them diff --git a/test_runner/performance/out_dir_to_csv.py b/test_runner/performance/out_dir_to_csv.py new file mode 100644 index 0000000000..8647ad4acc --- /dev/null +++ b/test_runner/performance/out_dir_to_csv.py @@ -0,0 +1,57 @@ +# Tool to convert the JSON output from running a perf test with `--out-dir` to a CSV that +# can be easily pasted into a spreadsheet for quick viz & analysis. +# Check the `./README.md` in this directory for `--out-dir`. 
+# +# TODO: add the pytest.mark.parametrize to the json and make them columns here +# https://github.com/neondatabase/neon/issues/11878 + +import csv +import json +import os +import sys + + +def json_to_csv(json_file): + with open(json_file) as f: + data = json.load(f) + + # Collect all possible metric names to form headers + all_metrics = set() + for result in data.get("result", []): + for metric in result.get("data", []): + all_metrics.add(metric["name"]) + + # Sort metrics for consistent output + metrics = sorted(list(all_metrics)) + + # Create headers + headers = ["suit"] + metrics + + # Prepare rows + rows = [] + for result in data.get("result", []): + row = {"suit": result["suit"]} + + # Initialize all metrics to empty + for metric in metrics: + row[metric] = "" + + # Fill in available metrics + for item in result.get("data", []): + row[item["name"]] = item["value"] + + rows.append(row) + + # Write to stdout as CSV + writer = csv.DictWriter(sys.stdout, fieldnames=headers) + writer.writeheader() + writer.writerows(rows) + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print(f"Usage: python {os.path.basename(__file__)} ") + sys.exit(1) + + json_file = sys.argv[1] + json_to_csv(json_file) diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index b17ca772c9..9e2312311a 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -10,7 +10,8 @@ from typing import Any import pytest from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, PgBin +from fixtures.pageserver.makelayers import l0stack from fixtures.utils import humantime_to_ms TARGET_RUNTIME = 30 @@ -34,28 +35,18 @@ class 
PageServicePipeliningConfigPipelined(PageServicePipeliningConfig): mode: str = "pipelined" -EXECUTION = ["concurrent-futures"] -BATCHING = ["uniform-lsn", "scattered-lsn"] - -NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] -for max_batch_size in [1, 32]: - for execution in EXECUTION: - for batching in BATCHING: - NON_BATCHABLE.append( - PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) - ) - -BATCHABLE: list[PageServicePipeliningConfig] = [] +PS_IO_CONCURRENCY = ["sidecar-task"] +PIPELINING_CONFIGS: list[PageServicePipeliningConfig] = [] for max_batch_size in [32]: - for execution in EXECUTION: - for batching in BATCHING: - BATCHABLE.append( + for execution in ["concurrent-futures"]: + for batching in ["scattered-lsn"]: + PIPELINING_CONFIGS.append( PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) ) @pytest.mark.parametrize( - "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name", + "tablesize_mib, pipelining_config, target_runtime, ps_io_concurrency, effective_io_concurrency, readhead_buffer_size, name", [ # batchable workloads should show throughput and CPU efficiency improvements *[ @@ -63,20 +54,23 @@ for max_batch_size in [32]: 50, config, TARGET_RUNTIME, + ps_io_concurrency, 100, 128, f"batchable {dataclasses.asdict(config)}", ) - for config in BATCHABLE + for config in PIPELINING_CONFIGS + for ps_io_concurrency in PS_IO_CONCURRENCY ], ], ) -def test_throughput( +def test_postgres_seqscan( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, tablesize_mib: int, pipelining_config: PageServicePipeliningConfig, target_runtime: int, + ps_io_concurrency: str, effective_io_concurrency: int, readhead_buffer_size: int, name: str, @@ -97,6 +91,10 @@ def test_throughput( If the compute provides pipeline depth (effective_io_concurrency=100), then pipelining configs, especially with max_batch_size>1 should yield dramatic 
improvements in all performance metrics. + + We advance the LSN from a disruptor thread to simulate the effect of a workload with concurrent writes + in another table. The `scattered-lsn` batching mode handles this well whereas the + initial implementation (`uniform-lsn`) would break the batch. """ # @@ -114,7 +112,19 @@ def test_throughput( } ) # For storing configuration as a metric, insert a fake 0 with labels with actual data - params.update({"pipelining_config": (0, {"labels": dataclasses.asdict(pipelining_config)})}) + params.update( + { + "config": ( + 0, + { + "labels": { + "pipelining_config": dataclasses.asdict(pipelining_config), + "ps_io_concurrency": ps_io_concurrency, + } + }, + ) + } + ) log.info("params: %s", params) @@ -266,7 +276,10 @@ def test_throughput( return iters env.pageserver.patch_config_toml_nonrecursive( - {"page_service_pipelining": dataclasses.asdict(pipelining_config)} + { + "page_service_pipelining": dataclasses.asdict(pipelining_config), + "get_vectored_concurrent_io": {"mode": ps_io_concurrency}, + } ) # set trace for log analysis below @@ -318,77 +331,63 @@ def test_throughput( ) -PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] -for max_batch_size in [1, 32]: - for execution in EXECUTION: - for batching in BATCHING: - PRECISION_CONFIGS.append( - PageServicePipeliningConfigPipelined(max_batch_size, execution, batching) - ) - - @pytest.mark.parametrize( - "pipelining_config,name", - [(config, f"{dataclasses.asdict(config)}") for config in PRECISION_CONFIGS], + "pipelining_config,ps_io_concurrency,l0_stack_height,queue_depth,name", + [ + (config, ps_io_concurrency, l0_stack_height, queue_depth, f"{dataclasses.asdict(config)}") + for config in PIPELINING_CONFIGS + for ps_io_concurrency in PS_IO_CONCURRENCY + for queue_depth in [1, 2, 32] + for l0_stack_height in [0, 20] + ], ) -def test_latency( +def test_random_reads( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: 
PgBin, pipelining_config: PageServicePipeliningConfig, + ps_io_concurrency: str, + l0_stack_height: int, + queue_depth: int, name: str, ): """ - Measure the latency impact of pipelining in an un-batchable workloads. - - An ideal implementation should not increase average or tail latencies for such workloads. - - We don't have support in pagebench to create queue depth yet. - => https://github.com/neondatabase/neon/issues/9837 + Throw pagebench random getpage at latest lsn workload from a single client against pageserver. """ # # Setup # + def build_snapshot_cb(neon_env_builder: NeonEnvBuilder) -> NeonEnv: + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start("main") + l0stack.make_l0_stack( + endpoint, + l0stack.L0StackShape(logical_table_size_mib=50, delta_stack_height=l0_stack_height), + ) + return env + + env = neon_env_builder.build_and_use_snapshot( + f"test_page_service_batching--test_pagebench-{l0_stack_height}", build_snapshot_cb + ) + def patch_ps_config(ps_config): - if pipelining_config is not None: - ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config) + ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config) + ps_config["get_vectored_concurrent_io"] = {"mode": ps_io_concurrency} - neon_env_builder.pageserver_config_override = patch_ps_config + env.pageserver.edit_config_toml(patch_ps_config) - env = neon_env_builder.init_start() - endpoint = env.endpoints.create_start("main") - conn = endpoint.connect() - cur = conn.cursor() + env.start() - cur.execute("SET max_parallel_workers_per_gather=0") # disable parallel backends - cur.execute("SET effective_io_concurrency=1") - - cur.execute("CREATE EXTENSION IF NOT EXISTS neon;") - cur.execute("CREATE EXTENSION IF NOT EXISTS neon_test_utils;") - - log.info("Filling the table") - cur.execute("CREATE TABLE t (data char(1000)) with (fillfactor=10)") - tablesize = 50 * 1024 * 1024 - npages = tablesize // (8 * 1024) - cur.execute("INSERT INTO t 
SELECT generate_series(1, %s)", (npages,)) - # TODO: can we force postgres to do sequential scans? - - cur.close() - conn.close() - - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) - - endpoint.stop() + lsn = env.safekeepers[0].get_commit_lsn(env.initial_tenant, env.initial_timeline) + ep = env.endpoints.create_start("main", lsn=lsn) + data_table_relnode_oid = ep.safe_psql_scalar("SELECT 'data'::regclass::oid") + ep.stop_and_destroy() for sk in env.safekeepers: sk.stop() - # - # Run single-threaded pagebench (TODO: dedup with other benchmark code) - # - env.pageserver.allowed_errors.append( # https://github.com/neondatabase/neon/issues/6925 r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" @@ -396,6 +395,8 @@ def test_latency( ps_http = env.pageserver.http_client() + metrics_before = ps_http.get_metrics() + cmd = [ str(env.neon_binpath / "pagebench"), "get-page-latest-lsn", @@ -405,6 +406,10 @@ def test_latency( env.pageserver.connstr(password=None), "--num-clients", "1", + "--queue-depth", + str(queue_depth), + "--only-relnode", + str(data_table_relnode_oid), "--runtime", "10s", ] @@ -413,12 +418,22 @@ def test_latency( results_path = Path(basepath + ".stdout") log.info(f"Benchmark results at: {results_path}") + metrics_after = ps_http.get_metrics() + with open(results_path) as f: results = json.load(f) log.info(f"Results:\n{json.dumps(results, sort_keys=True, indent=2)}") total = results["total"] + metric = "request_count" + zenbenchmark.record( + metric, + metric_value=total[metric], + unit="", + report=MetricReport.HIGHER_IS_BETTER, + ) + metric = "latency_mean" zenbenchmark.record( metric, @@ -435,3 +450,17 @@ def test_latency( unit="ms", report=MetricReport.LOWER_IS_BETTER, ) + + reads_before = metrics_before.query_one( + "pageserver_io_operations_seconds_count", filter={"operation": "read"} + ) + reads_after = metrics_after.query_one( + "pageserver_io_operations_seconds_count", 
filter={"operation": "read"} + ) + + zenbenchmark.record( + "virtual_file_reads", + metric_value=reads_after.value - reads_before.value, + unit="", + report=MetricReport.LOWER_IS_BETTER, + ) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index d28240c722..24ba0713d2 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -221,7 +221,7 @@ def test_remote_extensions( endpoint.create_remote_extension_spec(spec) - endpoint.start(remote_ext_config=extensions_endpoint) + endpoint.start(remote_ext_base_url=extensions_endpoint) with endpoint.connect() as conn: with conn.cursor() as cur: @@ -249,7 +249,7 @@ def test_remote_extensions( # Remove the extension files to force a redownload of the extension. extension.remove(test_output_dir, pg_version) - endpoint.start(remote_ext_config=extensions_endpoint) + endpoint.start(remote_ext_base_url=extensions_endpoint) # Test that ALTER EXTENSION UPDATE statements also fetch remote extensions. 
with endpoint.connect() as conn: diff --git a/test_runner/regress/test_import_pgdata.py b/test_runner/regress/test_import_pgdata.py index a26c3994a5..0472b92145 100644 --- a/test_runner/regress/test_import_pgdata.py +++ b/test_runner/regress/test_import_pgdata.py @@ -24,6 +24,7 @@ from fixtures.utils import ( skip_in_debug_build, wait_until, ) +from fixtures.workload import Workload from mypy_boto3_kms import KMSClient from mypy_boto3_kms.type_defs import EncryptResponseTypeDef from mypy_boto3_s3 import S3Client @@ -97,6 +98,10 @@ def test_pgdata_import_smoke( f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/" ) + if neon_env_builder.storage_controller_config is None: + neon_env_builder.storage_controller_config = {} + neon_env_builder.storage_controller_config["timelines_onto_safekeepers"] = True + env = neon_env_builder.init_start() # The test needs LocalFs support, which is only built in testing mode. @@ -125,9 +130,8 @@ def test_pgdata_import_smoke( elif rel_block_size == RelBlockSize.TWO_STRPES_PER_SHARD: target_relblock_size = (shard_count or 1) * stripe_size * 8192 * 2 elif rel_block_size == RelBlockSize.MULTIPLE_RELATION_SEGMENTS: - # Postgres uses a 1GiB segment size, fixed at compile time, so we must use >2GB of data - # to exercise multiple segments. 
- target_relblock_size = int(((2.333 * 1024 * 1024 * 1024) // 8192) * 8192) + segment_size = 16 * 1024 * 1024 + target_relblock_size = segment_size * 8 else: raise ValueError @@ -286,34 +290,28 @@ def test_pgdata_import_smoke( # # validate that we can write # - rw_endpoint = env.endpoints.create_start( - branch_name=import_branch_name, - endpoint_id="rw", - tenant_id=tenant_id, - config_lines=ep_config, - ) - rw_endpoint.safe_psql("create table othertable(values text)") - rw_lsn = Lsn(rw_endpoint.safe_psql_scalar("select pg_current_wal_flush_lsn()")) + workload = Workload(env, tenant_id, timeline_id, branch_name=import_branch_name) + workload.init() + workload.write_rows(64) + workload.validate() - # TODO: consider using `class Workload` here - # to do compaction and whatnot? + rw_lsn = Lsn(workload.endpoint().safe_psql_scalar("select pg_current_wal_flush_lsn()")) # # validate that we can branch (important use case) # # ... at the tip - _ = env.create_branch( + child_timeline_id = env.create_branch( new_branch_name="br-tip", ancestor_branch_name=import_branch_name, tenant_id=tenant_id, ancestor_start_lsn=rw_lsn, ) - br_tip_endpoint = env.endpoints.create_start( - branch_name="br-tip", endpoint_id="br-tip-ro", tenant_id=tenant_id, config_lines=ep_config - ) - validate_vanilla_equivalence(br_tip_endpoint) - br_tip_endpoint.safe_psql("select * from othertable") + child_workload = workload.branch(timeline_id=child_timeline_id, branch_name="br-tip") + child_workload.validate() + + validate_vanilla_equivalence(child_workload.endpoint()) # ... 
at the initdb lsn _ = env.create_branch( @@ -330,7 +328,7 @@ def test_pgdata_import_smoke( ) validate_vanilla_equivalence(br_initdb_endpoint) with pytest.raises(psycopg2.errors.UndefinedTable): - br_initdb_endpoint.safe_psql("select * from othertable") + br_initdb_endpoint.safe_psql(f"select * from {workload.table}") @run_only_on_default_postgres(reason="PG version is irrelevant here") @@ -414,6 +412,88 @@ def test_import_completion_on_restart( wait_until(cplane_notified) +@run_only_on_default_postgres(reason="PG version is irrelevant here") +def test_import_respects_tenant_shutdown( + neon_env_builder: NeonEnvBuilder, vanilla_pg: VanillaPostgres, make_httpserver: HTTPServer +): + """ + Validate that importing timelines respect the usual timeline life cycle: + 1. Shut down on tenant shut-down and resumes upon re-attach + 2. Deletion on timeline deletion (TODO) + """ + # Set up mock control plane HTTP server to listen for import completions + import_completion_signaled = Event() + + def handler(request: Request) -> Response: + log.info(f"control plane /import_complete request: {request.json}") + import_completion_signaled.set() + return Response(json.dumps({}), status=200) + + cplane_mgmt_api_server = make_httpserver + cplane_mgmt_api_server.expect_request( + "/storage/api/v1/import_complete", method="PUT" + ).respond_with_handler(handler) + + # Plug the cplane mock in + neon_env_builder.control_plane_hooks_api = ( + f"http://{cplane_mgmt_api_server.host}:{cplane_mgmt_api_server.port}/storage/api/v1/" + ) + + # The import will specify a local filesystem path mocking remote storage + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + + vanilla_pg.start() + vanilla_pg.stop() + + env = neon_env_builder.init_configs() + env.start() + + importbucket_path = neon_env_builder.repo_dir / "test_import_completion_bucket" + mock_import_bucket(vanilla_pg, importbucket_path) + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + 
idempotency = ImportPgdataIdemptencyKey.random() + + # Pause before sending the notification + failpoint_name = "import-timeline-pre-execute-pausable" + env.pageserver.http_client().configure_failpoints((failpoint_name, "pause")) + + env.storage_controller.tenant_create(tenant_id) + env.storage_controller.timeline_create( + tenant_id, + { + "new_timeline_id": str(timeline_id), + "import_pgdata": { + "idempotency_key": str(idempotency), + "location": {"LocalFs": {"path": str(importbucket_path.absolute())}}, + }, + }, + ) + + def hit_failpoint(): + log.info("Checking log for pattern...") + try: + assert env.pageserver.log_contains(f".*at failpoint {failpoint_name}.*") + except Exception: + log.exception("Failed to find pattern in log") + raise + + wait_until(hit_failpoint) + assert not import_completion_signaled.is_set() + + # Restart the pageserver while an import job is in progress. + # This clears the failpoint and we expect that the import starts up afresh + # after the restart and eventually completes. 
+ env.pageserver.stop() + env.pageserver.start() + + def cplane_notified(): + assert import_completion_signaled.is_set() + + wait_until(cplane_notified) + + def test_fast_import_with_pageserver_ingest( test_output_dir, vanilla_pg: VanillaPostgres, @@ -521,7 +601,9 @@ def test_fast_import_with_pageserver_ingest( env.neon_cli.mappings_map_branch(import_branch_name, tenant_id, timeline_id) # Run fast_import - fast_import.set_aws_creds(mock_s3_server, {"RUST_LOG": "aws_config=debug,aws_sdk_kms=debug"}) + fast_import.set_aws_creds( + mock_s3_server, {"RUST_LOG": "info,aws_config=debug,aws_sdk_kms=debug"} + ) pg_port = port_distributor.get_port() fast_import.run_pgdata(pg_port=pg_port, s3prefix=f"s3://{bucket}/{key_prefix}") @@ -641,6 +723,55 @@ def test_fast_import_binary( assert res[0][0] == 10 +def test_fast_import_event_triggers( + test_output_dir, + vanilla_pg: VanillaPostgres, + port_distributor: PortDistributor, + fast_import: FastImport, +): + vanilla_pg.start() + vanilla_pg.safe_psql(""" + CREATE FUNCTION test_event_trigger_for_drops() + RETURNS event_trigger LANGUAGE plpgsql AS $$ + DECLARE + obj record; + BEGIN + FOR obj IN SELECT * FROM pg_event_trigger_dropped_objects() + LOOP + RAISE NOTICE '% dropped object: % %.% %', + tg_tag, + obj.object_type, + obj.schema_name, + obj.object_name, + obj.object_identity; + END LOOP; + END + $$; + + CREATE EVENT TRIGGER test_event_trigger_for_drops + ON sql_drop + EXECUTE PROCEDURE test_event_trigger_for_drops(); + """) + + pg_port = port_distributor.get_port() + p = fast_import.run_pgdata(pg_port=pg_port, source_connection_string=vanilla_pg.connstr()) + assert p.returncode == 0 + + vanilla_pg.stop() + + pgbin = PgBin(test_output_dir, fast_import.pg_distrib_dir, fast_import.pg_version) + with VanillaPostgres( + fast_import.workdir / "pgdata", pgbin, pg_port, False + ) as new_pgdata_vanilla_pg: + new_pgdata_vanilla_pg.start() + + # database name and user are hardcoded in fast_import binary, and they are different from 
normal vanilla postgres + conn = PgProtocol(dsn=f"postgresql://cloud_admin@localhost:{pg_port}/neondb") + res = conn.safe_psql("SELECT count(*) FROM pg_event_trigger;") + log.info(f"Result: {res}") + assert res[0][0] == 0, f"Neon does not support importing event triggers, got: {res[0][0]}" + + def test_fast_import_restore_to_connstring( test_output_dir, vanilla_pg: VanillaPostgres, diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 3aa0c63979..f2523ec9b5 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -510,7 +510,7 @@ def list_elegible_layers( except KeyError: # Unexpected: tests should call this when pageservers are in a quiet state such that the layer map # matches what's on disk. - log.warn(f"Lookup {layer_file_name} from {list(visible_map.keys())}") + log.warning(f"Lookup {layer_file_name} from {list(visible_map.keys())}") raise return list(c for c in candidates if is_visible(c)) @@ -636,7 +636,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): except: # On assertion failures, log some details to help with debugging heatmap = env.pageserver_remote_storage.heatmap_content(tenant_id) - log.warn(f"heatmap contents: {json.dumps(heatmap, indent=2)}") + log.warning(f"heatmap contents: {json.dumps(heatmap, indent=2)}") raise # Scrub the remote storage diff --git a/test_runner/regress/test_proxy_metric_collection.py b/test_runner/regress/test_proxy_metric_collection.py index 85d8a6daaa..7442d50f68 100644 --- a/test_runner/regress/test_proxy_metric_collection.py +++ b/test_runner/regress/test_proxy_metric_collection.py @@ -52,6 +52,8 @@ def proxy_with_metric_collector( proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() external_http_port = port_distributor.get_port() + router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() (host, port) = 
httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" @@ -63,6 +65,8 @@ def proxy_with_metric_collector( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + router_port=router_port, + router_tls_port=router_tls_port, external_http_port=external_http_port, metric_collection_endpoint=metric_collection_endpoint, metric_collection_interval=metric_collection_interval, diff --git a/test_runner/regress/test_role_grants.py b/test_runner/regress/test_role_grants.py index b2251875f0..5b13d461f0 100644 --- a/test_runner/regress/test_role_grants.py +++ b/test_runner/regress/test_role_grants.py @@ -39,3 +39,10 @@ def test_role_grants(neon_simple_env: NeonEnv): res = cur.fetchall() assert res == [(1,)], "select should not succeed" + + # confirm that replicas can also ensure the grants are correctly set. + replica = env.endpoints.new_replica_start(endpoint) + replica_client = replica.http_client() + replica_client.set_role_grants( + "test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"] + ) diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 19952fc71b..61893f22ba 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING import backoff from fixtures.log_helper import log -from fixtures.neon_fixtures import PgProtocol, VanillaPostgres +from fixtures.neon_fixtures import NeonProxy, PgProtocol, VanillaPostgres if TYPE_CHECKING: from pathlib import Path @@ -41,6 +41,7 @@ class PgSniRouter(PgProtocol): self, neon_binpath: Path, port: int, + tls_port: int, destination: str, tls_cert: Path, tls_key: Path, @@ -53,6 +54,7 @@ class PgSniRouter(PgProtocol): self.host = host self.neon_binpath = neon_binpath self.port = port + self.tls_port = tls_port self.destination = destination self.tls_cert = tls_cert self.tls_key = tls_key @@ -64,6 +66,7 @@ class PgSniRouter(PgProtocol): 
args = [ str(self.neon_binpath / "pg_sni_router"), *["--listen", f"127.0.0.1:{self.port}"], + *["--listen-tls", f"127.0.0.1:{self.tls_port}"], *["--tls-cert", str(self.tls_cert)], *["--tls-key", str(self.tls_key)], *["--destination", self.destination], @@ -127,10 +130,12 @@ def test_pg_sni_router( pg_port = vanilla_pg.default_options["port"] router_port = port_distributor.get_port() + router_tls_port = port_distributor.get_port() with PgSniRouter( neon_binpath=neon_binpath, port=router_port, + tls_port=router_tls_port, destination="local.neon.build", tls_cert=test_output_dir / "router.crt", tls_key=test_output_dir / "router.key", @@ -146,3 +151,22 @@ def test_pg_sni_router( hostaddr="127.0.0.1", ) assert out[0][0] == 1 + + +def test_pg_sni_router_in_proxy( + static_proxy: NeonProxy, + vanilla_pg: VanillaPostgres, +): + # static_proxy starts this. + assert vanilla_pg.is_running() + pg_port = vanilla_pg.default_options["port"] + + out = static_proxy.safe_psql( + "select 1", + dbname="postgres", + sslmode="require", + host=f"endpoint--namespace--{pg_port}.local.neon.build", + hostaddr="127.0.0.1", + port=static_proxy.router_port, + ) + assert out[0][0] == 1 diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index a71652af8a..d42c5d403e 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -1822,7 +1822,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( endpoint2.safe_psql( "SELECT pg_create_logical_replication_slot('test_slot_restore', 'pgoutput')" ) - lsn3 = wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + lsn3 = wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn1).keys()) == set([]) assert set(http.list_aux_files(env.initial_tenant, branch_timeline_id, lsn3).keys()) == set( 
["pg_replslot/test_slot_restore/state"] @@ -1839,7 +1839,7 @@ def test_timeline_detach_with_aux_files_with_detach_v1( assert all_reparented == set([]) # We need to ensure all safekeeper data are ingested before checking aux files: the API does not wait for LSN. - wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, branch_timeline_id) + wait_for_last_flush_lsn(env, endpoint2, env.initial_tenant, branch_timeline_id) assert set(http.list_aux_files(env.initial_tenant, env.initial_timeline, lsn2).keys()) == set( ["pg_replslot/test_slot_parent_1/state", "pg_replslot/test_slot_parent_2/state"] ), "main branch unaffected" diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c8dab02bfc..55c0d45abe 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c8dab02bfc003ae7bd59096919042d7840f3c194 +Subproject commit 55c0d45abe6467c02084c2192bca117eda6ce1e7 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index b838c8969b..de7640f55d 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit b838c8969b7c63f3e637a769656f5f36793b797c +Subproject commit de7640f55da07512834d5cc40c4b3fb376b5f04f diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 05ddf212e2..0bf96bd6d7 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 05ddf212e2e07b788b5c8b88bdcf98630941f6ae +Subproject commit 0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index eab3a37834..8be779fd3a 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit eab3a37834cac6ec0719bf817ac918a201712d66 +Subproject commit 8be779fd3ab9e87206da96a7e4842ef1abf04f44 diff --git a/vendor/revisions.json b/vendor/revisions.json index 74a6ff33d7..3e999760f4 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.4", - "eab3a37834cac6ec0719bf817ac918a201712d66" + "17.5", + 
"8be779fd3ab9e87206da96a7e4842ef1abf04f44" ], "v16": [ - "16.8", - "05ddf212e2e07b788b5c8b88bdcf98630941f6ae" + "16.9", + "0bf96bd6d70301a0b43b0b3457bb3cf8fb43c198" ], "v15": [ - "15.12", - "b838c8969b7c63f3e637a769656f5f36793b797c" + "15.13", + "de7640f55da07512834d5cc40c4b3fb376b5f04f" ], "v14": [ - "14.17", - "c8dab02bfc003ae7bd59096919042d7840f3c194" + "14.18", + "55c0d45abe6467c02084c2192bca117eda6ce1e7" ] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f3d8b951a8..69d44b82ea 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -39,8 +39,10 @@ env_logger = { version = "0.11" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } form_urlencoded = { version = "1" } futures-channel = { version = "0.3", features = ["sink"] } +futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } +futures-task = { version = "0.3", default-features = false, features = ["std"] } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } @@ -58,7 +60,8 @@ lazy_static = { version = "1", default-features = false, features = ["spin_no_st libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } -nix = { version = "0.26" } +nix-2f80eeee3b1b6c7e = { package = "nix", version = "0.26" } +nix-fa1f6196edfd7249 = { package = "nix", version = "0.30", features = ["dir", "ioctl", "mman", "poll", "signal", "socket"] } nom = { version = "7" } num = { version = "0.4" } num-bigint = { version = "0.4" } @@ -70,6 +73,7 @@ num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } p256 = { version = "0.13", features = ["jwk"] } parquet = { version 
= "53", default-features = false, features = ["zstd"] } +percent-encoding = { version = "2" } prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" }