diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 5ccfe48684..69b8bc5d70 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -671,6 +671,10 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init + # Increase timeout to 12h, default timeout is 6h + # we have regression in clickbench causing it to run 2-3x longer + timeout-minutes: 720 + steps: - uses: actions/checkout@v4 @@ -716,7 +720,7 @@ jobs: test_selection: performance/test_perf_olap.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - extra_params: -m remote_cluster --timeout 21600 -k test_clickbench + extra_params: -m remote_cluster --timeout 43200 -k test_clickbench pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0d3ea7db28..bba51ddc92 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -839,6 +839,7 @@ jobs: - name: Build vm image run: | ./vm-builder \ + -size=2G \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} @@ -1116,7 +1117,10 @@ jobs: gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \ -f deployPgSniRouter=true \ - -f deployProxy=true \ + -f deployProxyLink=true \ + -f deployPrivatelinkProxy=true \ + -f deployProxyScram=true \ + -f deployProxyAuthBroker=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} else diff --git a/Cargo.lock b/Cargo.lock index 7fa5df29fd..c5af247e8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3749,6 +3749,7 @@ dependencies = [ "tracing", "url", "utils", + "wal_decoder", "walkdir", "workspace_hack", ] @@ -4186,6 +4187,7 @@ dependencies = [ "regex", "serde", "thiserror", + "tracing", "utils", ] @@ -6272,7 +6274,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" dependencies = [ "futures", "nix 0.26.4", @@ -6788,7 +6790,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" dependencies = [ "bytes", "io-uring", @@ -6954,6 +6956,20 @@ dependencies = [ "utils", ] +[[package]] +name = "wal_decoder" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "pageserver_api", + "postgres_ffi", + "serde", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "walkdir" version = "2.3.3" diff --git a/Cargo.toml b/Cargo.toml index 4c6a24ecde..7f9a766ff9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ members = [ "libs/postgres_ffi/wal_craft", "libs/vm_monitor", "libs/walproposer", + "libs/wal_decoder", ] [workspace.package] @@ -238,6 +239,7 @@ tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } vm_monitor = { version = "0.1", path = 
"./libs/vm_monitor/" } walproposer = { version = "0.1", path = "./libs/walproposer/" } +wal_decoder = { version = "0.1", path = "./libs/wal_decoder" } ## Common library dependency workspace_hack = { version = "0.1", path = "./workspace_hack/" } diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6451e309f0..85fb9c441d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -666,7 +666,7 @@ RUN apt-get update && \ # # Use new version only for v17 # because Release_2024_09_1 has some backward incompatible changes -# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 +# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN case "${PG_VERSION}" in \ "v17") \ @@ -860,18 +860,98 @@ ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" USER nonroot WORKDIR /home/nonroot -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \ - esac && \ - curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ + case "${PG_VERSION}" in \ + 'v17') \ + echo 'v17 is not supported yet by pgrx. Quit' && exit 0;; \ + esac && \ cargo install --locked --version 0.11.3 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root +######################################################################################### +# +# Layer "rust extensions pgrx12" +# +# pgrx started to support Postgres 17 since version 12, +# but some older extension aren't compatible with it. +# This layer should be used as a base for new pgrx extensions, +# and eventually get merged with `rust-extensions-build` +# +######################################################################################### +FROM build-deps AS rust-extensions-build-pgrx12 +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN apt-get update && \ + apt-get install --no-install-recommends -y curl libclang-dev && \ + useradd -ms /bin/bash nonroot -b /home + +ENV HOME=/home/nonroot +ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +USER nonroot +WORKDIR /home/nonroot + +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ + chmod +x rustup-init && \ + ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ + rm rustup-init && \ + cargo install --locked --version 0.12.6 cargo-pgrx && \ + /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root + +######################################################################################### +# +# Layers "pg-onnx-build" and "pgrag-pg-build" +# Compile "pgrag" extensions +# +######################################################################################### + +FROM rust-extensions-build-pgrx12 AS pg-onnx-build + +# cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25). +# Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise +RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \ + python3 -m venv venv && \ + . 
venv/bin/activate && \ + python3 -m pip install cmake==3.30.5 && \ + wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ + mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ + ./build.sh --config Release --parallel --skip_submodule_sync --skip_tests --allow_running_as_root + + +FROM pg-onnx-build AS pgrag-pg-build + +RUN apt-get install -y protobuf-compiler && \ + wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ + echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ + mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \ + \ + cd exts/rag && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgrx install --release && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \ + \ + cd ../rag_bge_small_en_v15 && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ + cargo pgrx install --release --features remote_onnx && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \ + \ + cd ../rag_jina_reranker_v1_tiny_en && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ + cargo pgrx install --release --features remote_onnx && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control + + ######################################################################################### # # Layer "pg-jsonschema-pg-build" @@ -1041,6 +1121,31 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control +######################################################################################### +# +# Layer "pg_mooncake" +# compile pg_mooncake extension +# +######################################################################################### +FROM rust-extensions-build AS pg-mooncake-build +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PG_MOONCAKE_VERSION=882175dbba07ba2e6e59b1088d61bf325b910b9e +ENV PATH="/usr/local/pgsql/bin/:$PATH" + +RUN case "${PG_VERSION}" in \ + 'v14') \ + echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \ + esac && \ + git clone --depth 1 --branch neon https://github.com/kelvich/pg_mooncake.git pg_mooncake-src && \ + cd pg_mooncake-src && \ + git checkout "${PG_MOONCAKE_VERSION}" && \ + git submodule update --init --depth 1 --recursive && \ + make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \ + make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -1059,6 +1164,7 @@ COPY --from=h3-pg-build /h3/usr / COPY --from=unit-pg-build 
/usr/local/pgsql/ /usr/local/pgsql/ COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1084,6 +1190,7 @@ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -1247,6 +1354,7 @@ COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.patch /ext-src/ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src +#COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ #COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src #COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 51a55b513f..79f894c289 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -18,7 +18,7 @@ commands: - name: pgbouncer user: postgres sysvInitAction: respawn - shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' + shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0' - name: local_proxy user: postgres sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 43e57a4ed5..ff04b9e4c6 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -18,7 +18,7 @@ commands: - name: pgbouncer user: postgres sysvInitAction: respawn - shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' + shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0' - name: local_proxy user: postgres sysvInitAction: respawn diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 5b5828c6ed..8df0a714ec 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -17,7 +17,7 @@ use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; -use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo}; +use pageserver_api::models::{self, TenantInfo, TimelineInfo}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use postgres_backend::AuthType; @@ -399,11 +399,6 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - switch_aux_file_policy: settings - .remove("switch_aux_file_policy") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'switch_aux_file_policy'")?, lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") @@ -499,11 +494,6 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - switch_aux_file_policy: settings - 
.remove("switch_aux_file_policy") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'switch_aux_file_policy'")?, lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 73d89699ed..b7f38c6286 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -111,6 +111,11 @@ enum Command { #[arg(long)] node: NodeId, }, + /// Cancel any ongoing reconciliation for this shard + TenantShardCancelReconcile { + #[arg(long)] + tenant_shard_id: TenantShardId, + }, /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure /// that is passed through to pageservers, and does not affect storage controller behavior. TenantConfig { @@ -535,6 +540,15 @@ async fn main() -> anyhow::Result<()> { ) .await?; } + Command::TenantShardCancelReconcile { tenant_shard_id } => { + storcon_client + .dispatch::<(), ()>( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/cancel_reconcile"), + None, + ) + .await?; + } Command::TenantConfig { tenant_id, config } => { let tenant_conf = serde_json::from_str(&config)?; diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 896a5d8069..6b2d6cf625 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -250,12 +250,6 @@ pub struct TenantConfigToml { // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, - /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into - /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. - /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux - /// file is written. - pub switch_aux_file_policy: crate::models::AuxFilePolicy, - /// The length for an explicit LSN lease request. /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. 
#[serde(with = "humantime_serde")] @@ -475,7 +469,6 @@ impl Default for TenantConfigToml { lazy_slru_download: false, timeline_get_throttle: crate::models::ThrottleConfig::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, - switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(), lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 532185a366..ff705e79cd 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -5,9 +5,11 @@ pub mod controller_api; pub mod key; pub mod keyspace; pub mod models; +pub mod record; pub mod reltag; pub mod shard; /// Public API types pub mod upcall_api; +pub mod value; pub mod config; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d37f62185c..0a4992aea4 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -10,7 +10,6 @@ use std::{ io::{BufRead, Read}, num::{NonZeroU32, NonZeroU64, NonZeroUsize}, str::FromStr, - sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -309,7 +308,6 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, - pub switch_aux_file_policy: Option, pub lsn_lease_length: Option, pub lsn_lease_length_for_ts: Option, } @@ -350,68 +348,6 @@ pub enum AuxFilePolicy { CrossValidation, } -impl AuxFilePolicy { - pub fn is_valid_migration_path(from: Option, to: Self) -> bool { - matches!( - (from, to), - (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2) - ) - } - - /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used. - pub fn default_tenant_config() -> Self { - Self::V2 - } -} - -/// The aux file policy memory flag. Users can store `Option` into this atomic flag. 0 == unspecified. -pub struct AtomicAuxFilePolicy(AtomicUsize); - -impl AtomicAuxFilePolicy { - pub fn new(policy: Option) -> Self { - Self(AtomicUsize::new( - policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), - )) - } - - pub fn load(&self) -> Option { - match self.0.load(std::sync::atomic::Ordering::Acquire) { - 0 => None, - other => Some(AuxFilePolicy::from_usize(other)), - } - } - - pub fn store(&self, policy: Option) { - self.0.store( - policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), - std::sync::atomic::Ordering::Release, - ); - } -} - -impl AuxFilePolicy { - pub fn to_usize(self) -> usize { - match self { - Self::V1 => 1, - Self::CrossValidation => 2, - Self::V2 => 3, - } - } - - pub fn try_from_usize(this: usize) -> Option { - match this { - 1 => Some(Self::V1), - 2 => Some(Self::CrossValidation), - 3 => Some(Self::V2), - _ => None, - } - } - - pub fn from_usize(this: usize) -> Self { - Self::try_from_usize(this).unwrap() - } -} - #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum EvictionPolicy { @@ -1633,71 +1569,6 @@ mod tests { } } - #[test] - fn test_aux_file_migration_path() { - assert!(AuxFilePolicy::is_valid_migration_path( - None, - AuxFilePolicy::V1 - )); - assert!(AuxFilePolicy::is_valid_migration_path( - None, - AuxFilePolicy::V2 - )); - assert!(AuxFilePolicy::is_valid_migration_path( - None, - AuxFilePolicy::CrossValidation - )); - // Self-migration is not a valid migration path, and the caller should handle it by itself. 
- assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V1), - AuxFilePolicy::V1 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V2), - AuxFilePolicy::V2 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::CrossValidation), - AuxFilePolicy::CrossValidation - )); - // Migrations not allowed - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::CrossValidation), - AuxFilePolicy::V1 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V1), - AuxFilePolicy::V2 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V2), - AuxFilePolicy::V1 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V2), - AuxFilePolicy::CrossValidation - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V1), - AuxFilePolicy::CrossValidation - )); - // Migrations allowed - assert!(AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::CrossValidation), - AuxFilePolicy::V2 - )); - } - - #[test] - fn test_aux_parse() { - assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2); - assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2); - assert_eq!( - AuxFilePolicy::from_str("cross-validation").unwrap(), - AuxFilePolicy::CrossValidation - ); - } - #[test] fn test_image_compression_algorithm_parsing() { use ImageCompressionAlgorithm::*; diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs new file mode 100644 index 0000000000..b80ed2f203 --- /dev/null +++ b/libs/pageserver_api/src/record.rs @@ -0,0 +1,113 @@ +//! This module defines the WAL record format used within the pageserver. + +use bytes::Bytes; +use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember}; +use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId}; +use serde::{Deserialize, Serialize}; +use utils::bin_ser::DeserializeError; + +/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper +/// around a PostgreSQL WAL record, or a custom neon-specific "record". +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub enum NeonWalRecord { + /// Native PostgreSQL WAL record + Postgres { will_init: bool, rec: Bytes }, + + /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear) + ClearVisibilityMapFlags { + new_heap_blkno: Option, + old_heap_blkno: Option, + flags: u8, + }, + /// Mark transaction IDs as committed on a CLOG page + ClogSetCommitted { + xids: Vec, + timestamp: TimestampTz, + }, + /// Mark transaction IDs as aborted on a CLOG page + ClogSetAborted { xids: Vec }, + /// Extend multixact offsets SLRU + MultixactOffsetCreate { + mid: MultiXactId, + moff: MultiXactOffset, + }, + /// Extend multixact members SLRU. + MultixactMembersCreate { + moff: MultiXactOffset, + members: Vec, + }, + /// Update the map of AUX files, either writing or dropping an entry + AuxFile { + file_path: String, + content: Option, + }, + + /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. + #[cfg(feature = "testing")] + Test { + /// Append a string to the image. + append: String, + /// Clear the image before appending. + clear: bool, + /// Treat this record as an init record. `clear` should be set to true if this field is set + /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and + /// its references in `timeline.rs`. 
+ will_init: bool, + }, +} + +impl NeonWalRecord { + /// Does replaying this WAL record initialize the page from scratch, or does + /// it need to be applied over the previous image of the page? + pub fn will_init(&self) -> bool { + // If you change this function, you'll also need to change ValueBytes::will_init + match self { + NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, + #[cfg(feature = "testing")] + NeonWalRecord::Test { will_init, .. } => *will_init, + // None of the special neon record types currently initialize the page + _ => false, + } + } + + #[cfg(feature = "testing")] + pub fn wal_append(s: impl AsRef) -> Self { + Self::Test { + append: s.as_ref().to_string(), + clear: false, + will_init: false, + } + } + + #[cfg(feature = "testing")] + pub fn wal_clear() -> Self { + Self::Test { + append: "".to_string(), + clear: true, + will_init: false, + } + } + + #[cfg(feature = "testing")] + pub fn wal_init() -> Self { + Self::Test { + append: "".to_string(), + clear: true, + will_init: true, + } + } +} + +/// Build a human-readable string to describe a WAL record +/// +/// For debugging purposes +pub fn describe_wal_record(rec: &NeonWalRecord) -> Result { + match rec { + NeonWalRecord::Postgres { will_init, rec } => Ok(format!( + "will_init: {}, {}", + will_init, + describe_postgres_wal_record(rec)? + )), + _ => Ok(format!("{:?}", rec)), + } +} diff --git a/pageserver/src/repository.rs b/libs/pageserver_api/src/value.rs similarity index 73% rename from pageserver/src/repository.rs rename to libs/pageserver_api/src/value.rs index e4ebafd927..1f8ed30a9a 100644 --- a/pageserver/src/repository.rs +++ b/libs/pageserver_api/src/value.rs @@ -1,13 +1,16 @@ -use crate::walrecord::NeonWalRecord; -use anyhow::Result; +//! This module defines the value type used by the storage engine. +//! +//! A [`Value`] represents either a completely new value for one Key ([`Value::Image`]), +//! or a "delta" of how to get from previous version of the value to the new one +//! ([`Value::WalRecord`]]) +//! +//! Note that the [`Value`] type is used for the permananent storage format, so any +//! changes to it must be backwards compatible. + +use crate::record::NeonWalRecord; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use std::ops::AddAssign; -use std::time::Duration; -pub use pageserver_api::key::{Key, KEY_SIZE}; - -/// A 'value' stored for a one Key. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value @@ -20,10 +23,12 @@ pub enum Value { } impl Value { + #[inline(always)] pub fn is_image(&self) -> bool { matches!(self, Value::Image(_)) } + #[inline(always)] pub fn will_init(&self) -> bool { match self { Value::Image(_) => true, @@ -33,17 +38,18 @@ impl Value { } #[derive(Debug, PartialEq)] -pub(crate) enum InvalidInput { +pub enum InvalidInput { TooShortValue, TooShortPostgresRecord, } /// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets /// use this type for querying if a slice looks some particular way. -pub(crate) struct ValueBytes; +pub struct ValueBytes; impl ValueBytes { - pub(crate) fn will_init(raw: &[u8]) -> Result { + #[inline(always)] + pub fn will_init(raw: &[u8]) -> Result { if raw.len() < 12 { return Err(InvalidInput::TooShortValue); } @@ -79,6 +85,7 @@ impl ValueBytes { mod test { use super::*; + use bytes::Bytes; use utils::bin_ser::BeSer; macro_rules! 
roundtrip { @@ -229,56 +236,3 @@ mod test { assert!(!ValueBytes::will_init(&expected).unwrap()); } } - -/// -/// Result of performing GC -/// -#[derive(Default, Serialize, Debug)] -pub struct GcResult { - pub layers_total: u64, - pub layers_needed_by_cutoff: u64, - pub layers_needed_by_pitr: u64, - pub layers_needed_by_branches: u64, - pub layers_needed_by_leases: u64, - pub layers_not_updated: u64, - pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. - - #[serde(serialize_with = "serialize_duration_as_millis")] - pub elapsed: Duration, - - /// The layers which were garbage collected. - /// - /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be - /// dropped in tests. - #[cfg(feature = "testing")] - #[serde(skip)] - pub(crate) doomed_layers: Vec, -} - -// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds -fn serialize_duration_as_millis(d: &Duration, serializer: S) -> Result -where - S: serde::Serializer, -{ - d.as_millis().serialize(serializer) -} - -impl AddAssign for GcResult { - fn add_assign(&mut self, other: Self) { - self.layers_total += other.layers_total; - self.layers_needed_by_pitr += other.layers_needed_by_pitr; - self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; - self.layers_needed_by_branches += other.layers_needed_by_branches; - self.layers_needed_by_leases += other.layers_needed_by_leases; - self.layers_not_updated += other.layers_not_updated; - self.layers_removed += other.layers_removed; - - self.elapsed += other.elapsed; - - #[cfg(feature = "testing")] - { - let mut other = other; - self.doomed_layers.append(&mut other.doomed_layers); - } - } -} diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index ef17833a48..e1f5443cbe 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -15,6 +15,7 @@ memoffset.workspace = true thiserror.workspace = true serde.workspace = true utils.workspace = true +tracing.workspace = true [dev-dependencies] env_logger.workspace = true diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0d46ed6aac..6b219488ac 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -217,6 +217,7 @@ macro_rules! enum_pgversion { pub mod pg_constants; pub mod relfile_utils; +pub mod walrecord; // Export some widely used datatypes that are unlikely to change across Postgres versions pub use v14::bindings::RepOriginId; diff --git a/pageserver/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs similarity index 88% rename from pageserver/src/walrecord.rs rename to libs/postgres_ffi/src/walrecord.rs index dd199e2c55..dedbaef64d 100644 --- a/pageserver/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -1,107 +1,144 @@ +//! This module houses types used in decoding of PG WAL +//! records. //! -//! Functions for parsing WAL records. -//! +//! 
TODO: Generate separate types for each supported PG version -use anyhow::Result; +use crate::pg_constants; +use crate::XLogRecord; +use crate::{ + BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId, TimestampTz, + TransactionId, +}; +use crate::{BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD}; use bytes::{Buf, Bytes}; -use postgres_ffi::dispatch_pgversion; -use postgres_ffi::pg_constants; -use postgres_ffi::BLCKSZ; -use postgres_ffi::{BlockNumber, TimestampTz}; -use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; -use postgres_ffi::{RepOriginId, XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; -use tracing::*; -use utils::{bin_ser::DeserializeError, lsn::Lsn}; +use utils::bin_ser::DeserializeError; +use utils::lsn::Lsn; -/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper -/// around a PostgreSQL WAL record, or a custom neon-specific "record". -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub enum NeonWalRecord { - /// Native PostgreSQL WAL record - Postgres { will_init: bool, rec: Bytes }, - - /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear) - ClearVisibilityMapFlags { - new_heap_blkno: Option, - old_heap_blkno: Option, - flags: u8, - }, - /// Mark transaction IDs as committed on a CLOG page - ClogSetCommitted { - xids: Vec, - timestamp: TimestampTz, - }, - /// Mark transaction IDs as aborted on a CLOG page - ClogSetAborted { xids: Vec }, - /// Extend multixact offsets SLRU - MultixactOffsetCreate { - mid: MultiXactId, - moff: MultiXactOffset, - }, - /// Extend multixact members SLRU. - MultixactMembersCreate { - moff: MultiXactOffset, - members: Vec, - }, - /// Update the map of AUX files, either writing or dropping an entry - AuxFile { - file_path: String, - content: Option, - }, - - /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. - #[cfg(test)] - Test { - /// Append a string to the image. - append: String, - /// Clear the image before appending. - clear: bool, - /// Treat this record as an init record. `clear` should be set to true if this field is set - /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and - /// its references in `timeline.rs`. - will_init: bool, - }, +#[repr(C)] +#[derive(Debug)] +pub struct XlMultiXactCreate { + pub mid: MultiXactId, + /* new MultiXact's ID */ + pub moff: MultiXactOffset, + /* its starting offset in members file */ + pub nmembers: u32, + /* number of member XIDs */ + pub members: Vec, } -impl NeonWalRecord { - /// Does replaying this WAL record initialize the page from scratch, or does - /// it need to be applied over the previous image of the page? - pub fn will_init(&self) -> bool { - // If you change this function, you'll also need to change ValueBytes::will_init - match self { - NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, - #[cfg(test)] - NeonWalRecord::Test { will_init, .. 
} => *will_init, - // None of the special neon record types currently initialize the page - _ => false, +impl XlMultiXactCreate { + pub fn decode(buf: &mut Bytes) -> XlMultiXactCreate { + let mid = buf.get_u32_le(); + let moff = buf.get_u32_le(); + let nmembers = buf.get_u32_le(); + let mut members = Vec::new(); + for _ in 0..nmembers { + members.push(MultiXactMember::decode(buf)); + } + XlMultiXactCreate { + mid, + moff, + nmembers, + members, } } +} - #[cfg(test)] - pub(crate) fn wal_append(s: impl AsRef) -> Self { - Self::Test { - append: s.as_ref().to_string(), - clear: false, - will_init: false, +#[repr(C)] +#[derive(Debug)] +pub struct XlMultiXactTruncate { + pub oldest_multi_db: Oid, + /* to-be-truncated range of multixact offsets */ + pub start_trunc_off: MultiXactId, + /* just for completeness' sake */ + pub end_trunc_off: MultiXactId, + + /* to-be-truncated range of multixact members */ + pub start_trunc_memb: MultiXactOffset, + pub end_trunc_memb: MultiXactOffset, +} + +impl XlMultiXactTruncate { + pub fn decode(buf: &mut Bytes) -> XlMultiXactTruncate { + XlMultiXactTruncate { + oldest_multi_db: buf.get_u32_le(), + start_trunc_off: buf.get_u32_le(), + end_trunc_off: buf.get_u32_le(), + start_trunc_memb: buf.get_u32_le(), + end_trunc_memb: buf.get_u32_le(), } } +} - #[cfg(test)] - pub(crate) fn wal_clear() -> Self { - Self::Test { - append: "".to_string(), - clear: true, - will_init: false, +#[repr(C)] +#[derive(Debug)] +pub struct XlRelmapUpdate { + pub dbid: Oid, /* database ID, or 0 for shared map */ + pub tsid: Oid, /* database's tablespace, or pg_global */ + pub nbytes: i32, /* size of relmap data */ +} + +impl XlRelmapUpdate { + pub fn decode(buf: &mut Bytes) -> XlRelmapUpdate { + XlRelmapUpdate { + dbid: buf.get_u32_le(), + tsid: buf.get_u32_le(), + nbytes: buf.get_i32_le(), } } +} - #[cfg(test)] - pub(crate) fn wal_init() -> Self { - Self::Test { - append: "".to_string(), - clear: true, - will_init: true, +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginDrop { + pub node_id: RepOriginId, +} + +impl XlReploriginDrop { + pub fn decode(buf: &mut Bytes) -> XlReploriginDrop { + XlReploriginDrop { + node_id: buf.get_u16_le(), + } + } +} + +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginSet { + pub remote_lsn: Lsn, + pub node_id: RepOriginId, +} + +impl XlReploriginSet { + pub fn decode(buf: &mut Bytes) -> XlReploriginSet { + XlReploriginSet { + remote_lsn: Lsn(buf.get_u64_le()), + node_id: buf.get_u16_le(), + } + } +} + +#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub struct RelFileNode { + pub spcnode: Oid, /* tablespace */ + pub dbnode: Oid, /* database */ + pub relnode: Oid, /* relation */ +} + +#[repr(C)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct MultiXactMember { + pub xid: TransactionId, + pub status: MultiXactStatus, +} + +impl MultiXactMember { + pub fn decode(buf: &mut Bytes) -> MultiXactMember { + MultiXactMember { + xid: buf.get_u32_le(), + status: buf.get_u32_le(), } } } @@ -164,17 +201,17 @@ impl DecodedWALRecord { /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations /// by reading other existing relations' data blocks. This is more complex to apply than new-style database /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case. 
- pub(crate) fn is_dbase_create_copy(&self, pg_version: u32) -> bool { + pub fn is_dbase_create_copy(&self, pg_version: u32) -> bool { if self.xl_rmid == pg_constants::RM_DBASE_ID { let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK; match pg_version { 14 => { // Postgres 14 database creations are always the legacy kind - info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + info == crate::v14::bindings::XLOG_DBASE_CREATE } - 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, - 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, - 17 => info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 15 => info == crate::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 16 => info == crate::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 17 => info == crate::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, _ => { panic!("Unsupported postgres version {pg_version}") } @@ -185,35 +222,294 @@ impl DecodedWALRecord { } } -#[repr(C)] -#[derive(Debug, Clone, Copy)] -pub struct RelFileNode { - pub spcnode: Oid, /* tablespace */ - pub dbnode: Oid, /* database */ - pub relnode: Oid, /* relation */ -} +/// Main routine to decode a WAL record and figure out which blocks are modified +// +// See xlogrecord.h for details +// The overall layout of an XLOG record is: +// Fixed-size header (XLogRecord struct) +// XLogRecordBlockHeader struct +// If pg_constants::BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows +// If pg_constants::BKPIMAGE_HAS_HOLE and pg_constants::BKPIMAGE_IS_COMPRESSED, an +// XLogRecordBlockCompressHeader struct follows. +// If pg_constants::BKPBLOCK_SAME_REL is not set, a RelFileNode follows +// BlockNumber follows +// XLogRecordBlockHeader struct +// ... +// XLogRecordDataHeader[Short|Long] struct +// block data +// block data +// ... +// main data +// +// +// For performance reasons, the caller provides the DecodedWALRecord struct and the function just fills it in. +// It would be more natural for this function to return a DecodedWALRecord as return value, +// but reusing the caller-supplied struct avoids an allocation. +// This code is in the hot path for digesting incoming WAL, and is very performance sensitive. +// +pub fn decode_wal_record( + record: Bytes, + decoded: &mut DecodedWALRecord, + pg_version: u32, +) -> anyhow::Result<()> { + let mut rnode_spcnode: u32 = 0; + let mut rnode_dbnode: u32 = 0; + let mut rnode_relnode: u32 = 0; + let mut got_rnode = false; + let mut origin_id: u16 = 0; -#[repr(C)] -#[derive(Debug)] -pub struct XlRelmapUpdate { - pub dbid: Oid, /* database ID, or 0 for shared map */ - pub tsid: Oid, /* database's tablespace, or pg_global */ - pub nbytes: i32, /* size of relmap data */ -} + let mut buf = record.clone(); -impl XlRelmapUpdate { - pub fn decode(buf: &mut Bytes) -> XlRelmapUpdate { - XlRelmapUpdate { - dbid: buf.get_u32_le(), - tsid: buf.get_u32_le(), - nbytes: buf.get_i32_le(), + // 1. Parse XLogRecord struct + + // FIXME: assume little-endian here + let xlogrec = XLogRecord::from_bytes(&mut buf)?; + + tracing::trace!( + "decode_wal_record xl_rmid = {} xl_info = {}", + xlogrec.xl_rmid, + xlogrec.xl_info + ); + + let remaining: usize = xlogrec.xl_tot_len as usize - XLOG_SIZE_OF_XLOG_RECORD; + + if buf.remaining() != remaining { + //TODO error + } + + let mut max_block_id = 0; + let mut blocks_total_len: u32 = 0; + let mut main_data_len = 0; + let mut datatotal: u32 = 0; + decoded.blocks.clear(); + + // 2. Decode the headers. 
+ // XLogRecordBlockHeaders if any, + // XLogRecordDataHeader[Short|Long] + while buf.remaining() > datatotal as usize { + let block_id = buf.get_u8(); + + match block_id { + pg_constants::XLR_BLOCK_ID_DATA_SHORT => { + /* XLogRecordDataHeaderShort */ + main_data_len = buf.get_u8() as u32; + datatotal += main_data_len; + } + + pg_constants::XLR_BLOCK_ID_DATA_LONG => { + /* XLogRecordDataHeaderLong */ + main_data_len = buf.get_u32_le(); + datatotal += main_data_len; + } + + pg_constants::XLR_BLOCK_ID_ORIGIN => { + // RepOriginId is uint16 + origin_id = buf.get_u16_le(); + } + + pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => { + // TransactionId is uint32 + buf.advance(4); + } + + 0..=pg_constants::XLR_MAX_BLOCK_ID => { + /* XLogRecordBlockHeader */ + let mut blk = DecodedBkpBlock::new(); + + if block_id <= max_block_id { + // TODO + //report_invalid_record(state, + // "out-of-order block_id %u at %X/%X", + // block_id, + // (uint32) (state->ReadRecPtr >> 32), + // (uint32) state->ReadRecPtr); + // goto err; + } + max_block_id = block_id; + + let fork_flags: u8 = buf.get_u8(); + blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK; + blk.flags = fork_flags; + blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0; + blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0; + blk.will_init = (fork_flags & pg_constants::BKPBLOCK_WILL_INIT) != 0; + blk.data_len = buf.get_u16_le(); + + /* TODO cross-check that the HAS_DATA flag is set iff data_length > 0 */ + + datatotal += blk.data_len as u32; + blocks_total_len += blk.data_len as u32; + + if blk.has_image { + blk.bimg_len = buf.get_u16_le(); + blk.hole_offset = buf.get_u16_le(); + blk.bimg_info = buf.get_u8(); + + blk.apply_image = dispatch_pgversion!( + pg_version, + (blk.bimg_info & pgv::bindings::BKPIMAGE_APPLY) != 0 + ); + + let blk_img_is_compressed = + crate::bkpimage_is_compressed(blk.bimg_info, pg_version); + + if blk_img_is_compressed { + tracing::debug!("compressed block image , pg_version = {}", pg_version); + } + + if blk_img_is_compressed { + if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 { + blk.hole_length = buf.get_u16_le(); + } else { + blk.hole_length = 0; + } + } else { + blk.hole_length = BLCKSZ - blk.bimg_len; + } + datatotal += blk.bimg_len as u32; + blocks_total_len += blk.bimg_len as u32; + + /* + * cross-check that hole_offset > 0, hole_length > 0 and + * bimg_len < BLCKSZ if the HAS_HOLE flag is set. + */ + if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 + && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ) + { + // TODO + /* + report_invalid_record(state, + "pg_constants::BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + (unsigned int) blk->bimg_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } + + /* + * cross-check that hole_offset == 0 and hole_length == 0 if + * the HAS_HOLE flag is not set. + */ + if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 + && (blk.hole_offset != 0 || blk.hole_length != 0) + { + // TODO + /* + report_invalid_record(state, + "pg_constants::BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } + + /* + * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED + * flag is set. 
+ */ + if !blk_img_is_compressed && blk.bimg_len == BLCKSZ { + // TODO + /* + report_invalid_record(state, + "pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", + (unsigned int) blk->bimg_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } + + /* + * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor + * IS_COMPRESSED flag is set. + */ + if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 + && !blk_img_is_compressed + && blk.bimg_len != BLCKSZ + { + // TODO + /* + report_invalid_record(state, + "neither pg_constants::BKPIMAGE_HAS_HOLE nor pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", + (unsigned int) blk->data_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } + } + if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 { + rnode_spcnode = buf.get_u32_le(); + rnode_dbnode = buf.get_u32_le(); + rnode_relnode = buf.get_u32_le(); + got_rnode = true; + } else if !got_rnode { + // TODO + /* + report_invalid_record(state, + "pg_constants::BKPBLOCK_SAME_REL set but no previous rel at %X/%X", + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; */ + } + + blk.rnode_spcnode = rnode_spcnode; + blk.rnode_dbnode = rnode_dbnode; + blk.rnode_relnode = rnode_relnode; + + blk.blkno = buf.get_u32_le(); + tracing::trace!( + "this record affects {}/{}/{} blk {}", + rnode_spcnode, + rnode_dbnode, + rnode_relnode, + blk.blkno + ); + + decoded.blocks.push(blk); + } + + _ => { + // TODO: invalid block_id + } } } + + // 3. Decode blocks. + let mut ptr = record.len() - buf.remaining(); + for blk in decoded.blocks.iter_mut() { + if blk.has_image { + blk.bimg_offset = ptr as u32; + ptr += blk.bimg_len as usize; + } + if blk.has_data { + ptr += blk.data_len as usize; + } + } + // We don't need them, so just skip blocks_total_len bytes + buf.advance(blocks_total_len as usize); + assert_eq!(ptr, record.len() - buf.remaining()); + + let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize; + + // 4. Decode main_data + if main_data_len > 0 { + assert_eq!(buf.remaining(), main_data_len as usize); + } + + decoded.xl_xid = xlogrec.xl_xid; + decoded.xl_info = xlogrec.xl_info; + decoded.xl_rmid = xlogrec.xl_rmid; + decoded.record = record; + decoded.origin_id = origin_id; + decoded.main_data_offset = main_data_offset; + + Ok(()) } pub mod v14 { + use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; - use postgres_ffi::{OffsetNumber, TransactionId}; #[repr(C)] #[derive(Debug)] @@ -383,8 +679,8 @@ pub mod v15 { pub mod v16 { pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange}; + use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; - use postgres_ffi::{OffsetNumber, TransactionId}; pub struct XlHeapDelete { pub xmax: TransactionId, @@ -450,8 +746,8 @@ pub mod v16 { /* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. 
*/ pub mod rm_neon { + use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; - use postgres_ffi::{OffsetNumber, TransactionId}; #[repr(C)] #[derive(Debug)] @@ -563,8 +859,8 @@ pub mod v16 { pub mod v17 { pub use super::v14::XlHeapLockUpdated; + pub use crate::{TimeLineID, TimestampTz}; use bytes::{Buf, Bytes}; - pub use postgres_ffi::{TimeLineID, TimestampTz}; pub use super::v16::rm_neon; pub use super::v16::{ @@ -742,7 +1038,7 @@ impl XlXactParsedRecord { let spcnode = buf.get_u32_le(); let dbnode = buf.get_u32_le(); let relnode = buf.get_u32_le(); - trace!( + tracing::trace!( "XLOG_XACT_COMMIT relfilenode {}/{}/{}", spcnode, dbnode, @@ -756,9 +1052,9 @@ impl XlXactParsedRecord { } } - if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { + if xinfo & crate::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { let nitems = buf.get_i32_le(); - debug!( + tracing::debug!( "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}", nitems ); @@ -778,7 +1074,7 @@ impl XlXactParsedRecord { if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 { xid = buf.get_u32_le(); - debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid); + tracing::debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid); } let origin_lsn = if xinfo & pg_constants::XACT_XINFO_HAS_ORIGIN != 0 { @@ -822,78 +1118,6 @@ impl XlClogTruncate { } } -#[repr(C)] -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub struct MultiXactMember { - pub xid: TransactionId, - pub status: MultiXactStatus, -} - -impl MultiXactMember { - pub fn decode(buf: &mut Bytes) -> MultiXactMember { - MultiXactMember { - xid: buf.get_u32_le(), - status: buf.get_u32_le(), - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlMultiXactCreate { - pub mid: MultiXactId, - /* new MultiXact's ID */ - pub moff: MultiXactOffset, - /* its starting offset in members file */ - pub nmembers: u32, - /* number of member XIDs */ - pub members: Vec, -} - -impl XlMultiXactCreate { - pub fn decode(buf: &mut Bytes) -> XlMultiXactCreate { - let mid = buf.get_u32_le(); - let moff = buf.get_u32_le(); - let nmembers = buf.get_u32_le(); - let mut members = Vec::new(); - for _ in 0..nmembers { - members.push(MultiXactMember::decode(buf)); - } - XlMultiXactCreate { - mid, - moff, - nmembers, - members, - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlMultiXactTruncate { - pub oldest_multi_db: Oid, - /* to-be-truncated range of multixact offsets */ - pub start_trunc_off: MultiXactId, - /* just for completeness' sake */ - pub end_trunc_off: MultiXactId, - - /* to-be-truncated range of multixact members */ - pub start_trunc_memb: MultiXactOffset, - pub end_trunc_memb: MultiXactOffset, -} - -impl XlMultiXactTruncate { - pub fn decode(buf: &mut Bytes) -> XlMultiXactTruncate { - XlMultiXactTruncate { - oldest_multi_db: buf.get_u32_le(), - start_trunc_off: buf.get_u32_le(), - end_trunc_off: buf.get_u32_le(), - start_trunc_memb: buf.get_u32_le(), - end_trunc_memb: buf.get_u32_le(), - } - } -} - #[repr(C)] #[derive(Debug)] pub struct XlLogicalMessage { @@ -950,337 +1174,7 @@ impl XlRunningXacts { } } -#[repr(C)] -#[derive(Debug)] -pub struct XlReploriginDrop { - pub node_id: RepOriginId, -} - -impl XlReploriginDrop { - pub fn decode(buf: &mut Bytes) -> XlReploriginDrop { - XlReploriginDrop { - node_id: buf.get_u16_le(), - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlReploriginSet { - pub remote_lsn: Lsn, - pub node_id: RepOriginId, -} - -impl XlReploriginSet { - pub fn decode(buf: &mut Bytes) -> XlReploriginSet 
{ - XlReploriginSet { - remote_lsn: Lsn(buf.get_u64_le()), - node_id: buf.get_u16_le(), - } - } -} - -/// Main routine to decode a WAL record and figure out which blocks are modified -// -// See xlogrecord.h for details -// The overall layout of an XLOG record is: -// Fixed-size header (XLogRecord struct) -// XLogRecordBlockHeader struct -// If pg_constants::BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows -// If pg_constants::BKPIMAGE_HAS_HOLE and pg_constants::BKPIMAGE_IS_COMPRESSED, an -// XLogRecordBlockCompressHeader struct follows. -// If pg_constants::BKPBLOCK_SAME_REL is not set, a RelFileNode follows -// BlockNumber follows -// XLogRecordBlockHeader struct -// ... -// XLogRecordDataHeader[Short|Long] struct -// block data -// block data -// ... -// main data -// -// -// For performance reasons, the caller provides the DecodedWALRecord struct and the function just fills it in. -// It would be more natural for this function to return a DecodedWALRecord as return value, -// but reusing the caller-supplied struct avoids an allocation. -// This code is in the hot path for digesting incoming WAL, and is very performance sensitive. -// -pub fn decode_wal_record( - record: Bytes, - decoded: &mut DecodedWALRecord, - pg_version: u32, -) -> Result<()> { - let mut rnode_spcnode: u32 = 0; - let mut rnode_dbnode: u32 = 0; - let mut rnode_relnode: u32 = 0; - let mut got_rnode = false; - let mut origin_id: u16 = 0; - - let mut buf = record.clone(); - - // 1. Parse XLogRecord struct - - // FIXME: assume little-endian here - let xlogrec = XLogRecord::from_bytes(&mut buf)?; - - trace!( - "decode_wal_record xl_rmid = {} xl_info = {}", - xlogrec.xl_rmid, - xlogrec.xl_info - ); - - let remaining: usize = xlogrec.xl_tot_len as usize - XLOG_SIZE_OF_XLOG_RECORD; - - if buf.remaining() != remaining { - //TODO error - } - - let mut max_block_id = 0; - let mut blocks_total_len: u32 = 0; - let mut main_data_len = 0; - let mut datatotal: u32 = 0; - decoded.blocks.clear(); - - // 2. Decode the headers. 
- // XLogRecordBlockHeaders if any, - // XLogRecordDataHeader[Short|Long] - while buf.remaining() > datatotal as usize { - let block_id = buf.get_u8(); - - match block_id { - pg_constants::XLR_BLOCK_ID_DATA_SHORT => { - /* XLogRecordDataHeaderShort */ - main_data_len = buf.get_u8() as u32; - datatotal += main_data_len; - } - - pg_constants::XLR_BLOCK_ID_DATA_LONG => { - /* XLogRecordDataHeaderLong */ - main_data_len = buf.get_u32_le(); - datatotal += main_data_len; - } - - pg_constants::XLR_BLOCK_ID_ORIGIN => { - // RepOriginId is uint16 - origin_id = buf.get_u16_le(); - } - - pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => { - // TransactionId is uint32 - buf.advance(4); - } - - 0..=pg_constants::XLR_MAX_BLOCK_ID => { - /* XLogRecordBlockHeader */ - let mut blk = DecodedBkpBlock::new(); - - if block_id <= max_block_id { - // TODO - //report_invalid_record(state, - // "out-of-order block_id %u at %X/%X", - // block_id, - // (uint32) (state->ReadRecPtr >> 32), - // (uint32) state->ReadRecPtr); - // goto err; - } - max_block_id = block_id; - - let fork_flags: u8 = buf.get_u8(); - blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK; - blk.flags = fork_flags; - blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0; - blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0; - blk.will_init = (fork_flags & pg_constants::BKPBLOCK_WILL_INIT) != 0; - blk.data_len = buf.get_u16_le(); - - /* TODO cross-check that the HAS_DATA flag is set iff data_length > 0 */ - - datatotal += blk.data_len as u32; - blocks_total_len += blk.data_len as u32; - - if blk.has_image { - blk.bimg_len = buf.get_u16_le(); - blk.hole_offset = buf.get_u16_le(); - blk.bimg_info = buf.get_u8(); - - blk.apply_image = dispatch_pgversion!( - pg_version, - (blk.bimg_info & pgv::bindings::BKPIMAGE_APPLY) != 0 - ); - - let blk_img_is_compressed = - postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); - - if blk_img_is_compressed { - debug!("compressed block image , pg_version = {}", pg_version); - } - - if blk_img_is_compressed { - if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 { - blk.hole_length = buf.get_u16_le(); - } else { - blk.hole_length = 0; - } - } else { - blk.hole_length = BLCKSZ - blk.bimg_len; - } - datatotal += blk.bimg_len as u32; - blocks_total_len += blk.bimg_len as u32; - - /* - * cross-check that hole_offset > 0, hole_length > 0 and - * bimg_len < BLCKSZ if the HAS_HOLE flag is set. - */ - if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 - && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ) - { - // TODO - /* - report_invalid_record(state, - "pg_constants::BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", - (unsigned int) blk->hole_offset, - (unsigned int) blk->hole_length, - (unsigned int) blk->bimg_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } - - /* - * cross-check that hole_offset == 0 and hole_length == 0 if - * the HAS_HOLE flag is not set. - */ - if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 - && (blk.hole_offset != 0 || blk.hole_length != 0) - { - // TODO - /* - report_invalid_record(state, - "pg_constants::BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", - (unsigned int) blk->hole_offset, - (unsigned int) blk->hole_length, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } - - /* - * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED - * flag is set. 
- */ - if !blk_img_is_compressed && blk.bimg_len == BLCKSZ { - // TODO - /* - report_invalid_record(state, - "pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", - (unsigned int) blk->bimg_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } - - /* - * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor - * IS_COMPRESSED flag is set. - */ - if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 - && !blk_img_is_compressed - && blk.bimg_len != BLCKSZ - { - // TODO - /* - report_invalid_record(state, - "neither pg_constants::BKPIMAGE_HAS_HOLE nor pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", - (unsigned int) blk->data_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } - } - if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 { - rnode_spcnode = buf.get_u32_le(); - rnode_dbnode = buf.get_u32_le(); - rnode_relnode = buf.get_u32_le(); - got_rnode = true; - } else if !got_rnode { - // TODO - /* - report_invalid_record(state, - "pg_constants::BKPBLOCK_SAME_REL set but no previous rel at %X/%X", - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; */ - } - - blk.rnode_spcnode = rnode_spcnode; - blk.rnode_dbnode = rnode_dbnode; - blk.rnode_relnode = rnode_relnode; - - blk.blkno = buf.get_u32_le(); - trace!( - "this record affects {}/{}/{} blk {}", - rnode_spcnode, - rnode_dbnode, - rnode_relnode, - blk.blkno - ); - - decoded.blocks.push(blk); - } - - _ => { - // TODO: invalid block_id - } - } - } - - // 3. Decode blocks. - let mut ptr = record.len() - buf.remaining(); - for blk in decoded.blocks.iter_mut() { - if blk.has_image { - blk.bimg_offset = ptr as u32; - ptr += blk.bimg_len as usize; - } - if blk.has_data { - ptr += blk.data_len as usize; - } - } - // We don't need them, so just skip blocks_total_len bytes - buf.advance(blocks_total_len as usize); - assert_eq!(ptr, record.len() - buf.remaining()); - - let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize; - - // 4. Decode main_data - if main_data_len > 0 { - assert_eq!(buf.remaining(), main_data_len as usize); - } - - decoded.xl_xid = xlogrec.xl_xid; - decoded.xl_info = xlogrec.xl_info; - decoded.xl_rmid = xlogrec.xl_rmid; - decoded.record = record; - decoded.origin_id = origin_id; - decoded.main_data_offset = main_data_offset; - - Ok(()) -} - -/// -/// Build a human-readable string to describe a WAL record -/// -/// For debugging purposes -pub fn describe_wal_record(rec: &NeonWalRecord) -> Result { - match rec { - NeonWalRecord::Postgres { will_init, rec } => Ok(format!( - "will_init: {}, {}", - will_init, - describe_postgres_wal_record(rec)? - )), - _ => Ok(format!("{:?}", rec)), - } -} - -fn describe_postgres_wal_record(record: &Bytes) -> Result { +pub fn describe_postgres_wal_record(record: &Bytes) -> Result { // TODO: It would be nice to use the PostgreSQL rmgrdesc infrastructure for this. // Maybe use the postgres wal redo process, the same used for replaying WAL records? 
// Or could we compile the rmgrdesc routines into the dump_layer_file() binary directly, diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml new file mode 100644 index 0000000000..3f80f8fcdb --- /dev/null +++ b/libs/wal_decoder/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "wal_decoder" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[features] +testing = [] + +[dependencies] +anyhow.workspace = true +bytes.workspace = true +pageserver_api.workspace = true +postgres_ffi.workspace = true +serde.workspace = true +tracing.workspace = true +utils.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/libs/wal_decoder/src/decoder.rs @@ -0,0 +1 @@ + diff --git a/libs/wal_decoder/src/lib.rs b/libs/wal_decoder/src/lib.rs new file mode 100644 index 0000000000..05349d17c9 --- /dev/null +++ b/libs/wal_decoder/src/lib.rs @@ -0,0 +1,2 @@ +pub mod decoder; +pub mod models; diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs new file mode 100644 index 0000000000..58f8e1b2da --- /dev/null +++ b/libs/wal_decoder/src/models.rs @@ -0,0 +1,167 @@ +//! This module houses types which represent decoded PG WAL records +//! ready for the pageserver to interpret. They are derived from the original +//! WAL records, so that each struct corresponds closely to one WAL record of +//! a specific kind. They contain the same information as the original WAL records, +//! just decoded into structs and fields for easier access. +//! +//! The ingestion code uses these structs to help with parsing the WAL records, +//! and it splits them into a stream of modifications to the key-value pairs that +//! are ultimately stored in delta layers. See also the split-out counterparts in +//! [`postgres_ffi::walrecord`]. +//! +//! The pipeline which processes WAL records is not super obvious, so let's follow +//! the flow of an example XACT_COMMIT Postgres record: +//! +//! (Postgres XACT_COMMIT record) +//! | +//! |--> pageserver::walingest::WalIngest::decode_xact_record +//! | +//! |--> ([`XactRecord::Commit`]) +//! | +//! |--> pageserver::walingest::WalIngest::ingest_xact_record +//! | +//! |--> (NeonWalRecord::ClogSetCommitted) +//! | +//! 
|--> write to KV store within the pageserver + +use bytes::Bytes; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::walrecord::{ + XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet, + XlSmgrTruncate, XlXactParsedRecord, +}; +use postgres_ffi::{Oid, TransactionId}; +use utils::lsn::Lsn; + +pub enum HeapamRecord { + ClearVmBits(ClearVmBits), +} + +pub struct ClearVmBits { + pub new_heap_blkno: Option, + pub old_heap_blkno: Option, + pub vm_rel: RelTag, + pub flags: u8, +} + +pub enum NeonrmgrRecord { + ClearVmBits(ClearVmBits), +} + +pub enum SmgrRecord { + Create(SmgrCreate), + Truncate(XlSmgrTruncate), +} + +pub struct SmgrCreate { + pub rel: RelTag, +} + +pub enum DbaseRecord { + Create(DbaseCreate), + Drop(DbaseDrop), +} + +pub struct DbaseCreate { + pub db_id: Oid, + pub tablespace_id: Oid, + pub src_db_id: Oid, + pub src_tablespace_id: Oid, +} + +pub struct DbaseDrop { + pub db_id: Oid, + pub tablespace_ids: Vec, +} + +pub enum ClogRecord { + ZeroPage(ClogZeroPage), + Truncate(ClogTruncate), +} + +pub struct ClogZeroPage { + pub segno: u32, + pub rpageno: u32, +} + +pub struct ClogTruncate { + pub pageno: u32, + pub oldest_xid: TransactionId, + pub oldest_xid_db: Oid, +} + +pub enum XactRecord { + Commit(XactCommon), + Abort(XactCommon), + CommitPrepared(XactCommon), + AbortPrepared(XactCommon), + Prepare(XactPrepare), +} + +pub struct XactCommon { + pub parsed: XlXactParsedRecord, + pub origin_id: u16, + // Fields below are only used for logging + pub xl_xid: TransactionId, + pub lsn: Lsn, +} + +pub struct XactPrepare { + pub xl_xid: TransactionId, + pub data: Bytes, +} + +pub enum MultiXactRecord { + ZeroPage(MultiXactZeroPage), + Create(XlMultiXactCreate), + Truncate(XlMultiXactTruncate), +} + +pub struct MultiXactZeroPage { + pub slru_kind: SlruKind, + pub segno: u32, + pub rpageno: u32, +} + +pub enum RelmapRecord { + Update(RelmapUpdate), +} + +pub struct RelmapUpdate { + pub update: XlRelmapUpdate, + pub buf: Bytes, +} + +pub enum XlogRecord { + Raw(RawXlogRecord), +} + +pub struct RawXlogRecord { + pub info: u8, + pub lsn: Lsn, + pub buf: Bytes, +} + +pub enum LogicalMessageRecord { + Put(PutLogicalMessage), + #[cfg(feature = "testing")] + Failpoint, +} + +pub struct PutLogicalMessage { + pub path: String, + pub buf: Bytes, +} + +pub enum StandbyRecord { + RunningXacts(StandbyRunningXacts), +} + +pub struct StandbyRunningXacts { + pub oldest_running_xid: TransactionId, +} + +pub enum ReploriginRecord { + Set(XlReploriginSet), + Drop(XlReploriginDrop), +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 2531abc7a1..ecb8fa7491 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints", "pageserver_api/testing" ] +testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"] [dependencies] anyhow.workspace = true @@ -83,6 +83,7 @@ enum-map.workspace = true enumset = { workspace = true, features = ["serde"]} strum.workspace = true strum_macros.workspace = true +wal_decoder.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index d98b23acce..0a1ad9cd6b 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -8,13 +8,12 @@ use pageserver::{ context::{DownloadBehavior, RequestContext}, l0_flush::{L0FlushConfig, L0FlushGlobalState}, page_cache, - repository::Value, task_mgr::TaskKind, tenant::storage_layer::inmemory_layer::SerializedBatch, tenant::storage_layer::InMemoryLayer, virtual_file, }; -use pageserver_api::{key::Key, shard::TenantShardId}; +use pageserver_api::{key::Key, shard::TenantShardId, value::Value}; use utils::{ bin_ser::BeSer, id::{TenantId, TimelineId}, diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 1353e79f7c..5c5b52db44 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,9 +1,9 @@ use criterion::measurement::WallTime; use pageserver::keyspace::{KeyPartitioning, KeySpace}; -use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::storage_layer::PersistentLayerDesc; +use pageserver_api::key::Key; use pageserver_api::shard::TenantShardId; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 45936cb3fa..d3551b56e1 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -60,7 +60,8 @@ use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use once_cell::sync::Lazy; -use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver::{config::PageServerConf, walredo::PostgresRedoManager}; +use pageserver_api::record::NeonWalRecord; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ future::Future, diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index bc939f9688..177e65ef79 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -51,7 +51,7 @@ //! 
use anyhow::{Context, Result}; -use pageserver::repository::Key; +use pageserver_api::key::Key; use std::cmp::Ordering; use std::io::{self, BufRead}; use std::path::PathBuf; diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 7dd2a5d05c..451d2a1d69 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -14,12 +14,12 @@ use std::ops::Range; use std::{fs, str}; use pageserver::page_cache::{self, PAGE_SZ}; -use pageserver::repository::{Key, KEY_SIZE}; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE}; use pageserver::tenant::storage_layer::range_overlaps; use pageserver::virtual_file::{self, VirtualFile}; +use pageserver_api::key::{Key, KEY_SIZE}; use utils::{bin_ser::BeSer, lsn::Lsn}; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index c0b2b6ae89..22627d72c8 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -14,13 +14,13 @@ use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; use pageserver::{ - repository::{Key, KEY_SIZE}, tenant::{ block_io::FileBlockReader, disk_btree::VisitDirection, storage_layer::delta_layer::DELTA_KEY_SIZE, }, virtual_file::VirtualFile, }; +use pageserver_api::key::{Key, KEY_SIZE}; use std::fs; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index bce3285606..923a7f1f18 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -1,4 +1,4 @@ -use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest}; +use pageserver_api::models::{TenantConfig, TenantConfigRequest}; use pageserver_api::shard::TenantShardId; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -66,10 +66,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { mgmt_api_client .tenant_config(&TenantConfigRequest { tenant_id: timeline.tenant_id, - config: TenantConfig { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, + config: TenantConfig::default(), }) .await?; diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 73bdc90213..7733bdb640 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -696,7 +696,7 @@ impl DeletionQueue { mod test { use camino::Utf8Path; use hex_literal::hex; - use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant}; + use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant}; use std::{io::ErrorKind, time::Duration}; use tracing::info; @@ -705,7 +705,6 @@ mod test { use crate::{ controller_upcall_client::RetryForeverError, - repository::Key, tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3943f62ac0..2d8f4309ca 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2232,13 +2232,13 @@ async fn getpage_at_lsn_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - struct Key(crate::repository::Key); + struct Key(pageserver_api::key::Key); impl std::str::FromStr for Key { 
type Err = anyhow::Error; fn from_str(s: &str) -> std::result::Result { - crate::repository::Key::from_hex(s).map(Key) + pageserver_api::key::Key::from_hex(s).map(Key) } } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index ca87f1d080..530c91c4da 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -19,12 +19,11 @@ use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::WalIngest; -use crate::walrecord::decode_wal_record; -use crate::walrecord::DecodedWALRecord; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord}; use postgres_ffi::ControlFileData; use postgres_ffi::DBState_DB_SHUTDOWNED; use postgres_ffi::Oid; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index d51931c768..ef6711397a 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -24,7 +24,6 @@ pub mod metrics; pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; -pub mod repository; pub mod span; pub(crate) mod statvfs; pub mod task_mgr; @@ -32,7 +31,6 @@ pub mod tenant; pub mod utilization; pub mod virtual_file; pub mod walingest; -pub mod walrecord; pub mod walredo; use camino::Utf8Path; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 19233a28cc..dc2dc08b53 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,14 +7,14 @@ //! Clarify that) //! use super::tenant::{PageReconstructError, Timeline}; +use crate::aux_file; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; -use crate::walrecord::NeonWalRecord; -use crate::{aux_file, repository::*}; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; +use pageserver_api::key::Key; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, @@ -22,7 +22,9 @@ use pageserver_api::key::{ CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::value::Value; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId}; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f846e145c5..64e4eb46ce 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -92,11 +92,11 @@ use crate::metrics::{ remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, }; -use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationMode; use crate::tenant::config::TenantConfOpt; +use crate::tenant::gc_result::GcResult; pub use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::remote_initdb_archive_path; use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; 
@@ -160,6 +160,7 @@ pub(crate) mod timeline; pub mod size; mod gc_block; +mod gc_result; pub(crate) mod throttle; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -301,6 +302,13 @@ pub struct Tenant { /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` timelines_offloaded: Mutex>>, + /// Serialize writes of the tenant manifest to remote storage. If there are concurrent operations + /// affecting the manifest, such as timeline deletion and timeline offload, they must wait for + /// each other (this could be optimized to coalesce writes if necessary). + /// + /// The contents of the Mutex are the last manifest we successfully uploaded + tenant_manifest_upload: tokio::sync::Mutex>, + // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration @@ -467,10 +475,10 @@ impl WalRedoManager { /// This method is cancellation-safe. pub async fn request_redo( &self, - key: crate::repository::Key, + key: pageserver_api::key::Key, lsn: Lsn, base_img: Option<(Lsn, bytes::Bytes)>, - records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>, + records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>, pg_version: u32, ) -> Result { match self { @@ -625,19 +633,10 @@ impl TimelineOrOffloaded { TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress, } } - fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc { + fn maybe_remote_client(&self) -> Option> { match self { - TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(), - TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() { - Some(remote_client) => remote_client, - None => { - let remote_client = tenant.build_timeline_client( - offloaded.timeline_id, - tenant.remote_storage.clone(), - ); - Arc::new(remote_client) - } - }, + TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()), + TimelineOrOffloaded::Offloaded(offloaded) => offloaded.remote_client.clone(), } } } @@ -749,6 +748,24 @@ pub enum TimelineArchivalError { Other(anyhow::Error), } +#[derive(thiserror::Error, Debug)] +pub(crate) enum TenantManifestError { + #[error("Remote storage error: {0}")] + RemoteStorage(anyhow::Error), + + #[error("Cancelled")] + Cancelled, +} + +impl From for TimelineArchivalError { + fn from(e: TenantManifestError) -> Self { + match e { + TenantManifestError::RemoteStorage(e) => Self::Other(e), + TenantManifestError::Cancelled => Self::Cancelled, + } + } +} + impl Debug for TimelineArchivalError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -1534,18 +1551,7 @@ impl Tenant { offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter()); } if !offloaded_timeline_ids.is_empty() { - let manifest = self.tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; - upload_tenant_manifest( - &self.remote_storage, - &self.tenant_shard_id, - generation, - &manifest, - &self.cancel, - ) - .await - .map_err(TimelineArchivalError::Other)?; + self.store_tenant_manifest().await?; } // The local filesystem contents are a cache of what's in the remote IndexPart; @@ -1830,6 +1836,18 @@ impl Tenant { ctx: RequestContext, ) -> Result, TimelineArchivalError> { info!("unoffloading timeline"); + + // We activate the timeline below manually, so 
this must be called on an active timeline.
+        // We expect callers of this function to ensure this.
+        match self.current_state() {
+            TenantState::Activating { .. }
+            | TenantState::Attaching
+            | TenantState::Broken { .. } => {
+                panic!("Timeline expected to be active")
+            }
+            TenantState::Stopping { .. } => return Err(TimelineArchivalError::Cancelled),
+            TenantState::Active => {}
+        }
         let cancel = self.cancel.clone();
         // Protect against concurrent attempts to use this TimelineId
@@ -1914,18 +1932,7 @@ impl Tenant {
         };
         // Upload new list of offloaded timelines to S3
-        let manifest = self.tenant_manifest();
-        // TODO: generation support
-        let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
-        upload_tenant_manifest(
-            &self.remote_storage,
-            &self.tenant_shard_id,
-            generation,
-            &manifest,
-            &cancel,
-        )
-        .await
-        .map_err(TimelineArchivalError::Other)?;
+        self.store_tenant_manifest().await?;
         // Activate the timeline (if it makes sense)
         if !(timeline.is_broken() || timeline.is_stopping()) {
@@ -3122,7 +3129,7 @@ impl Tenant {
             }
         }
-        let tenant_manifest = self.tenant_manifest();
+        let tenant_manifest = self.build_tenant_manifest();
         // TODO: generation support
         let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
         for child_shard in child_shards {
@@ -3317,7 +3324,8 @@ impl Tenant {
             .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
     }
-    pub(crate) fn tenant_manifest(&self) -> TenantManifest {
+    /// Generate an up-to-date TenantManifest based on the state of this Tenant.
+    fn build_tenant_manifest(&self) -> TenantManifest {
         let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
         let mut timeline_manifests = timelines_offloaded
@@ -3525,6 +3533,7 @@ impl Tenant {
             timelines: Mutex::new(HashMap::new()),
             timelines_creating: Mutex::new(HashSet::new()),
             timelines_offloaded: Mutex::new(HashMap::new()),
+            tenant_manifest_upload: Default::default(),
             gc_cs: tokio::sync::Mutex::new(()),
             walredo_mgr,
             remote_storage,
@@ -4704,6 +4713,49 @@ impl Tenant {
             .max()
             .unwrap_or(0)
     }
+
+    /// Serialize and write the latest TenantManifest to remote storage.
+    pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> {
+        // Only one manifest write may be done at a time, and the contents of the manifest
+        // must be loaded while holding this lock. This makes it safe to call this function
+        // from anywhere without worrying about colliding updates.
+        let mut guard = tokio::select! {
+            g = self.tenant_manifest_upload.lock() => {
+                g
+            },
+            _ = self.cancel.cancelled() => {
+                return Err(TenantManifestError::Cancelled);
+            }
+        };
+
+        let manifest = self.build_tenant_manifest();
+        if Some(&manifest) == (*guard).as_ref() {
+            // Optimisation: skip uploads that don't change anything.
+            return Ok(());
+        }
+
+        upload_tenant_manifest(
+            &self.remote_storage,
+            &self.tenant_shard_id,
+            self.generation,
+            &manifest,
+            &self.cancel,
+        )
+        .await
+        .map_err(|e| {
+            if self.cancel.is_cancelled() {
+                TenantManifestError::Cancelled
+            } else {
+                TenantManifestError::RemoteStorage(e)
+            }
+        })?;
+
+        // Store the successfully uploaded manifest, so that future callers can avoid
+        // re-uploading the same thing.
+ *guard = Some(manifest); + + Ok(()) + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository @@ -4806,7 +4858,8 @@ pub(crate) mod harness { use crate::deletion_queue::mock::MockDeletionQueue; use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; - use crate::{repository::Key, walrecord::NeonWalRecord}; + use pageserver_api::key::Key; + use pageserver_api::record::NeonWalRecord; use super::*; use hex_literal::hex; @@ -4853,7 +4906,6 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), - switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy), lsn_lease_length: Some(tenant_conf.lsn_lease_length), lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), } @@ -5076,25 +5128,30 @@ mod tests { use super::*; use crate::keyspace::KeySpaceAccum; - use crate::repository::{Key, Value}; use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; - use crate::walrecord::NeonWalRecord; use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; + use pageserver_api::value::Value; use rand::{thread_rng, Rng}; use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; - use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; - use timeline::{DeltaLayerTestDesc, GcInfo}; + use timeline::DeltaLayerTestDesc; use utils::id::TenantId; + #[cfg(feature = "testing")] + use pageserver_api::record::NeonWalRecord; + #[cfg(feature = "testing")] + use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; + #[cfg(feature = "testing")] + use timeline::GcInfo; + static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -7659,6 +7716,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_neon_test_record() -> anyhow::Result<()> { let harness = TenantHarness::create("test_neon_test_record").await?; @@ -7850,6 +7908,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?; @@ -8046,6 +8105,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_generate_key_retention() -> anyhow::Result<()> { let harness = TenantHarness::create("test_generate_key_retention").await?; @@ -8393,6 +8453,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> { let harness = @@ -8633,6 +8694,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()> { @@ -8841,6 +8903,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; @@ -9042,6 +9105,7 @@ mod tests { // // When querying the key range [A, B) we need to read at different LSN 
ranges // for [A, C) and [C, B). This test checks that the described edge case is handled correctly. + #[cfg(feature = "testing")] #[tokio::test] async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> { let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?; diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 502cb62fe8..ce686c89ef 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,7 +9,6 @@ //! may lead to a data loss. //! pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; use pageserver_api::models::{self, ThrottleConfig}; @@ -341,10 +340,6 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub switch_aux_file_policy: Option, - #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] #[serde(default)] @@ -410,9 +405,6 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), - switch_aux_file_policy: self - .switch_aux_file_policy - .unwrap_or(global_conf.switch_aux_file_policy), lsn_lease_length: self .lsn_lease_length .unwrap_or(global_conf.lsn_lease_length), @@ -470,7 +462,6 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, - switch_aux_file_policy: value.switch_aux_file_policy, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), } diff --git a/pageserver/src/tenant/gc_result.rs b/pageserver/src/tenant/gc_result.rs new file mode 100644 index 0000000000..c805aafeab --- /dev/null +++ b/pageserver/src/tenant/gc_result.rs @@ -0,0 +1,57 @@ +use anyhow::Result; +use serde::Serialize; +use std::ops::AddAssign; +use std::time::Duration; + +/// +/// Result of performing GC +/// +#[derive(Default, Serialize, Debug)] +pub struct GcResult { + pub layers_total: u64, + pub layers_needed_by_cutoff: u64, + pub layers_needed_by_pitr: u64, + pub layers_needed_by_branches: u64, + pub layers_needed_by_leases: u64, + pub layers_not_updated: u64, + pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. + + #[serde(serialize_with = "serialize_duration_as_millis")] + pub elapsed: Duration, + + /// The layers which were garbage collected. + /// + /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be + /// dropped in tests. 
+ #[cfg(feature = "testing")] + #[serde(skip)] + pub(crate) doomed_layers: Vec, +} + +// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds +fn serialize_duration_as_millis(d: &Duration, serializer: S) -> Result +where + S: serde::Serializer, +{ + d.as_millis().serialize(serializer) +} + +impl AddAssign for GcResult { + fn add_assign(&mut self, other: Self) { + self.layers_total += other.layers_total; + self.layers_needed_by_pitr += other.layers_needed_by_pitr; + self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; + self.layers_needed_by_branches += other.layers_needed_by_branches; + self.layers_needed_by_leases += other.layers_needed_by_leases; + self.layers_not_updated += other.layers_not_updated; + self.layers_removed += other.layers_removed; + + self.elapsed += other.elapsed; + + #[cfg(feature = "testing")] + { + let mut other = other; + self.doomed_layers.append(&mut other.doomed_layers); + } + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 707233b003..7f15baed10 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -48,9 +48,9 @@ mod layer_coverage; use crate::context::RequestContext; use crate::keyspace::KeyPartitioning; -use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use anyhow::Result; +use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use std::collections::{HashMap, VecDeque}; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 0567f8f3a7..a4c458b737 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2811,7 +2811,7 @@ where } use { - crate::repository::GcResult, pageserver_api::models::TimelineGcRequest, + crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest, utils::http::error::ApiError, }; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1c72c7fff8..19e762b9fa 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -249,7 +249,7 @@ pub(crate) use download::{ list_remote_tenant_shards, list_remote_timelines, }; pub(crate) use index::LayerFileMetadata; -pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest}; +pub(crate) use upload::upload_initdb_dir; // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 95f8f026d4..8679c68a27 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -403,59 +403,79 @@ async fn do_download_index_part( Ok((index_part, index_generation, index_part_mtime)) } -/// index_part.json objects are suffixed with a generation number, so we cannot -/// directly GET the latest index part without doing some probing. +/// Metadata objects are "generationed", meaning that they include a generation suffix. This +/// function downloads the object with the highest generation <= `my_generation`. /// -/// In this function we probe for the most recent index in a generation <= our current generation. 
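// Illustrative sketch (not part of the patch): the AddAssign impl in gc_result.rs above exists so
// that per-timeline GC results can be rolled up into one tenant-wide total. Assumes GcResult is in
// scope (it lives in pageserver::tenant::gc_result inside the crate); the function is hypothetical.
fn total_gc_result(per_timeline: Vec<GcResult>) -> GcResult {
    let mut total = GcResult::default();
    for timeline_result in per_timeline {
        // Layer counters and elapsed durations accumulate; with the "testing"
        // feature enabled, doomed_layers are appended as well.
        total += timeline_result;
    }
    total
}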
-/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md +/// Data objects (layer files) also include a generation in their path, but there is no equivalent +/// search process, because their reference from an index includes the generation. +/// +/// An expensive object listing operation is only done if necessary: the typical fast path is to issue two +/// GET operations, one to our own generation (stale attachment case), and one to the immediately preceding +/// generation (normal case when migrating/restarting). Only if both of these return 404 do we fall back +/// to listing objects. +/// +/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]` +/// * `what`: for logging, what object are we downloading +/// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation) +/// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless +/// `cancel`` has fired. This function does not do its own retries of GET operations, and relies +/// on the function passed in to do so. +/// * `parse_path`: parse a fully qualified remote storage path to get the generation of the object. +#[allow(clippy::too_many_arguments)] #[tracing::instrument(skip_all, fields(generation=?my_generation))] -pub(crate) async fn download_index_part( - storage: &GenericRemoteStorage, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, +pub(crate) async fn download_generation_object<'a, T, DF, DFF, PF>( + storage: &'a GenericRemoteStorage, + tenant_shard_id: &'a TenantShardId, + timeline_id: &'a TimelineId, my_generation: Generation, - cancel: &CancellationToken, -) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + what: &str, + prefix: RemotePath, + do_download: DF, + parse_path: PF, + cancel: &'a CancellationToken, +) -> Result<(T, Generation, SystemTime), DownloadError> +where + DF: Fn( + &'a GenericRemoteStorage, + &'a TenantShardId, + &'a TimelineId, + Generation, + &'a CancellationToken, + ) -> DFF, + DFF: Future>, + PF: Fn(RemotePath) -> Option, + T: 'static, +{ debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { // Operating without generations: just fetch the generation-less path - return do_download_index_part( - storage, - tenant_shard_id, - timeline_id, - my_generation, - cancel, - ) - .await; + return do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; } - // Stale case: If we were intentionally attached in a stale generation, there may already be a remote - // index in our generation. + // Stale case: If we were intentionally attached in a stale generation, the remote object may already + // exist in our generation. // // This is an optimization to avoid doing the listing for the general case below. - let res = - do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; + let res = do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; match res { - Ok(index_part) => { - tracing::debug!( - "Found index_part from current generation (this is a stale attachment)" - ); - return Ok(index_part); + Ok(decoded) => { + tracing::debug!("Found {what} from current generation (this is a stale attachment)"); + return Ok(decoded); } Err(DownloadError::NotFound) => {} Err(e) => return Err(e), }; - // Typical case: the previous generation of this tenant was running healthily, and had uploaded - // and index part. 
We may safely start from this index without doing a listing, because: + // Typical case: the previous generation of this tenant was running healthily, and had uploaded the object + // we are seeking in that generation. We may safely start from this index without doing a listing, because: // - We checked for current generation case above // - generations > my_generation are to be ignored - // - any other indices that exist would have an older generation than `previous_gen`, and - // we want to find the most recent index from a previous generation. + // - any other objects that exist would have an older generation than `previous_gen`, and + // we want to find the most recent object from a previous generation. // // This is an optimization to avoid doing the listing for the general case below. - let res = do_download_index_part( + let res = do_download( storage, tenant_shard_id, timeline_id, @@ -464,14 +484,12 @@ pub(crate) async fn download_index_part( ) .await; match res { - Ok(index_part) => { - tracing::debug!("Found index_part from previous generation"); - return Ok(index_part); + Ok(decoded) => { + tracing::debug!("Found {what} from previous generation"); + return Ok(decoded); } Err(DownloadError::NotFound) => { - tracing::debug!( - "No index_part found from previous generation, falling back to listing" - ); + tracing::debug!("No {what} found from previous generation, falling back to listing"); } Err(e) => { return Err(e); @@ -481,12 +499,10 @@ pub(crate) async fn download_index_part( // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json // objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent // to constructing a full index path with no generation, because the generation is a suffix. - let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); - - let indices = download_retry( + let paths = download_retry( || async { storage - .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel) + .list(Some(&prefix), ListingMode::NoDelimiter, None, cancel) .await }, "list index_part files", @@ -497,22 +513,22 @@ pub(crate) async fn download_index_part( // General case logic for which index to use: the latest index whose generation // is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md - let max_previous_generation = indices + let max_previous_generation = paths .into_iter() - .filter_map(|o| parse_remote_index_path(o.key)) + .filter_map(|o| parse_path(o.key)) .filter(|g| g <= &my_generation) .max(); match max_previous_generation { Some(g) => { - tracing::debug!("Found index_part in generation {g:?}"); - do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await + tracing::debug!("Found {what} in generation {g:?}"); + do_download(storage, tenant_shard_id, timeline_id, g, cancel).await } None => { // Migration from legacy pre-generation state: we have a generation but no prior // attached pageservers did. Try to load from a no-generation path. - tracing::debug!("No index_part.json* found"); - do_download_index_part( + tracing::debug!("No {what}* found"); + do_download( storage, tenant_shard_id, timeline_id, @@ -524,6 +540,33 @@ pub(crate) async fn download_index_part( } } +/// index_part.json objects are suffixed with a generation number, so we cannot +/// directly GET the latest index part without doing some probing. 
+/// +/// In this function we probe for the most recent index in a generation <= our current generation. +/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md +pub(crate) async fn download_index_part( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + my_generation: Generation, + cancel: &CancellationToken, +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); + download_generation_object( + storage, + tenant_shard_id, + timeline_id, + my_generation, + "index_part", + index_prefix, + do_download_index_part, + parse_remote_index_path, + cancel, + ) + .await +} + pub(crate) async fn download_initdb_tar_zst( conf: &'static PageServerConf, storage: &GenericRemoteStorage, diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 7d92d45146..c4382cb648 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use utils::{id::TimelineId, lsn::Lsn}; /// Tenant-shard scoped manifest -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct TenantManifest { /// Debugging aid describing the version of this manifest. /// Can also be used for distinguishing breaking changes later on. @@ -23,7 +23,7 @@ pub struct TenantManifest { /// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`], /// but the two datastructures serve different needs, this is for a persistent disk format /// that must be backwards compatible, while the other is only for informative purposes. -#[derive(Clone, Serialize, Deserialize, Copy)] +#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)] pub struct OffloadedTimelineManifest { pub timeline_id: TimelineId, /// Whether the timeline has a parent it has been branched off from or not diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 4a4c698b56..6c3276ea3c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -187,6 +187,8 @@ pub(super) async fn gather_inputs( // but it is unlikely to cause any issues. In the worst case, // the calculation will error out. timelines.retain(|t| t.is_active()); + // Also filter out archived timelines. + timelines.retain(|t| t.is_archived() != Some(true)); // Build a map of branch points. 
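// Illustrative sketch (not part of the patch): the probe order implemented by
// download_generation_object above, with `get` and `list_generations` as simplified stand-ins for
// the `do_download` and listing calls, and generations reduced to plain u32s.
fn find_latest_generationed_object<T>(
    my_generation: u32,
    get: &dyn Fn(Option<u32>) -> Option<T>, // GET at a generation; None = legacy, generation-less path
    list_generations: &dyn Fn() -> Vec<u32>, // expensive listing fallback
) -> Option<T> {
    // 1. Stale-attachment fast path: the object may already exist in our own generation.
    if let Some(found) = get(Some(my_generation)) {
        return Some(found);
    }
    // 2. Typical fast path: the immediately preceding generation.
    if my_generation > 0 {
        if let Some(found) = get(Some(my_generation - 1)) {
            return Some(found);
        }
    }
    // 3. General case: list everything and take the highest generation <= ours.
    if let Some(best) = list_generations()
        .into_iter()
        .filter(|g| *g <= my_generation)
        .max()
    {
        return get(Some(best));
    }
    // 4. Legacy case: no prior attachment wrote a generation suffix at all.
    get(None)
}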
let mut branchpoints: HashMap> = HashMap::new(); diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4a63491e90..8f4219bbbc 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -11,11 +11,11 @@ mod layer_name; pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; -use crate::repository::Value; -use crate::walrecord::NeonWalRecord; use bytes::Bytes; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::value::Value; use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 272e422c90..8a397ceb7a 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -5,7 +5,8 @@ use pageserver_api::key::{Key, KEY_SIZE}; use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; use crate::tenant::storage_layer::Layer; -use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; +use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; +use pageserver_api::value::Value; use super::layer::S3_UPLOAD_LIMIT; use super::{ diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 641729d681..10165b1d06 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -30,7 +30,6 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; -use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{ @@ -46,7 +45,7 @@ use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; -use crate::{walrecord, TEMP_FILE_SUFFIX}; +use crate::TEMP_FILE_SUFFIX; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; @@ -54,9 +53,11 @@ use futures::StreamExt; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::key::DBDIR_KEY; +use pageserver_api::key::{Key, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::collections::VecDeque; @@ -1293,7 +1294,7 @@ impl DeltaLayerInner { // is it an image or will_init walrecord? 
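// Illustrative sketch (not part of the patch): the two Value variants that the will_init check and
// the dump code above distinguish, now imported from pageserver_api::value / pageserver_api::record.
// The summarize function itself is hypothetical.
use pageserver_api::record::NeonWalRecord;
use pageserver_api::value::Value;

fn summarize(value: &Value) -> String {
    match value {
        Value::Image(img) => format!("full page image, {} bytes", img.len()),
        Value::WalRecord(NeonWalRecord::Postgres { will_init, rec }) => {
            format!("postgres wal record, {} bytes, will_init: {}", rec.len(), will_init)
        }
        Value::WalRecord(other) => format!("neon-specific wal record: {:?}", other),
    }
}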
// FIXME: this could be handled by threading the BlobRef to the // VectoredReadBuilder - let will_init = crate::repository::ValueBytes::will_init(&data) + let will_init = pageserver_api::value::ValueBytes::will_init(&data) .inspect_err(|_e| { #[cfg(feature = "testing")] tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); @@ -1356,7 +1357,7 @@ impl DeltaLayerInner { format!(" img {} bytes", img.len()) } Value::WalRecord(rec) => { - let wal_desc = walrecord::describe_wal_record(&rec)?; + let wal_desc = pageserver_api::record::describe_wal_record(&rec)?; format!( " rec {} bytes will_init: {} {}", buf.len(), @@ -1610,7 +1611,6 @@ pub(crate) mod test { use rand::RngCore; use super::*; - use crate::repository::Value; use crate::tenant::harness::TIMELINE_ID; use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; @@ -1622,6 +1622,7 @@ pub(crate) mod test { DEFAULT_PG_VERSION, }; use bytes::Bytes; + use pageserver_api::value::Value; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. Finally, @@ -1974,8 +1975,8 @@ pub(crate) mod test { #[tokio::test] async fn copy_delta_prefix_smoke() { - use crate::walrecord::NeonWalRecord; use bytes::Bytes; + use pageserver_api::record::NeonWalRecord; let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke") .await @@ -2198,6 +2199,7 @@ pub(crate) mod test { (k1, l1).cmp(&(k2, l2)) } + #[cfg(feature = "testing")] pub(crate) fn sort_delta_value( (k1, l1, v1): &(Key, Lsn, Value), (k2, l2, v2): &(Key, Lsn, Value), diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index f45dd4b801..ccfcf68e8f 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -7,7 +7,7 @@ use pageserver_api::{ }; use utils::lsn::Lsn; -use crate::repository::Value; +use pageserver_api::value::Value; use super::merge_iterator::MergeIterator; @@ -121,8 +121,8 @@ mod tests { #[tokio::test] async fn filter_keyspace_iterator() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator") .await diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 3f90df312d..c0d183dc08 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -28,7 +28,6 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; -use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, FileBlockReader}; use crate::tenant::disk_btree::{ @@ -51,8 +50,10 @@ use hex; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::key::DBDIR_KEY; +use pageserver_api::key::{Key, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::collections::VecDeque; @@ -1125,6 +1126,7 @@ mod test { use pageserver_api::{ key::Key, shard::{ShardCount, ShardIdentity, 
ShardNumber, ShardStripeSize}, + value::Value, }; use utils::{ generation::Generation, @@ -1134,7 +1136,6 @@ mod test { use crate::{ context::RequestContext, - repository::Value, tenant::{ config::TenantConf, harness::{TenantHarness, TIMELINE_ID}, diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 7573ddb5cc..df448a0963 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -7,7 +7,6 @@ use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::repository::{Key, Value}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::timeline::GetVectoredError; use crate::tenant::PageReconstructError; @@ -16,9 +15,11 @@ use crate::{l0_flush, page_cache}; use anyhow::{anyhow, Context, Result}; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; +use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; use std::collections::{BTreeMap, HashMap}; use std::sync::{Arc, OnceLock}; use std::time::Instant; diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 9de70f14ee..36dcc8d805 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -760,8 +760,8 @@ async fn evict_and_wait_does_not_wait_for_download() { /// Also checks that the same does not happen on a non-evicted layer (regression test). #[tokio::test(start_paused = true)] async fn eviction_cancellation_on_drop() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); @@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() { let mut writer = timeline.writer().await; writer .put( - crate::repository::Key::from_i128(5), + pageserver_api::key::Key::from_i128(5), Lsn(0x20), &Value::Image(Bytes::from_static(b"this does not matter either")), &ctx, diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index a30c25d780..2097e90764 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -3,7 +3,7 @@ use pageserver_api::shard::TenantShardId; use std::ops::Range; use utils::{id::TimelineId, lsn::Lsn}; -use crate::repository::Key; +use pageserver_api::key::Key; use super::{DeltaLayerName, ImageLayerName, LayerName}; diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index 8e750e1187..2b98d74f9f 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -1,7 +1,7 @@ //! //! Helper functions for dealing with filenames of the image and delta layer files. //! 
-use crate::repository::Key; +use pageserver_api::key::Key; use std::borrow::Cow; use std::cmp::Ordering; use std::fmt; diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index f91e27241d..980202f12c 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -7,7 +7,8 @@ use anyhow::bail; use pageserver_api::key::Key; use utils::lsn::Lsn; -use crate::{context::RequestContext, repository::Value}; +use crate::context::RequestContext; +use pageserver_api::value::Value; use super::{ delta_layer::{DeltaLayerInner, DeltaLayerIterator}, @@ -291,12 +292,16 @@ mod tests { use crate::{ tenant::{ harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, }, - walrecord::NeonWalRecord, DEFAULT_PG_VERSION, }; + #[cfg(feature = "testing")] + use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; + #[cfg(feature = "testing")] + use pageserver_api::record::NeonWalRecord; + async fn assert_merge_iter_equal( merge_iter: &mut MergeIterator<'_>, expect: &[(Key, Lsn, Value)], @@ -319,8 +324,8 @@ mod tests { #[tokio::test] async fn merge_in_between() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; let harness = TenantHarness::create("merge_iterator_merge_in_between") .await @@ -384,8 +389,8 @@ mod tests { #[tokio::test] async fn delta_merge() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; let harness = TenantHarness::create("merge_iterator_delta_merge") .await @@ -458,10 +463,11 @@ mod tests { // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge } + #[cfg(feature = "testing")] #[tokio::test] async fn delta_image_mixed_merge() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge") .await @@ -586,5 +592,6 @@ mod tests { is_send(merge_iter); } + #[cfg(feature = "testing")] fn is_send(_: impl Send) {} } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f8d61dac5e..d765a7c987 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -125,11 +125,12 @@ use utils::{ simple_rcu::{Rcu, RcuReadGuard}, }; -use crate::repository::GcResult; -use crate::repository::{Key, Value}; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant::gc_result::GcResult; use crate::ZERO_PAGE; +use pageserver_api::key::Key; +use pageserver_api::value::Value; use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; @@ -5822,17 +5823,15 @@ fn is_send() { #[cfg(test)] mod tests { use pageserver_api::key::Key; + use pageserver_api::value::Value; use utils::{id::TimelineId, lsn::Lsn}; - use crate::{ - repository::Value, - tenant::{ - harness::{test_img, TenantHarness}, - layer_map::LayerMap, - storage_layer::{Layer, LayerName}, - timeline::{DeltaLayerTestDesc, EvictionError}, - Timeline, - }, + use crate::tenant::{ + harness::{test_img, TenantHarness}, + layer_map::LayerMap, + storage_layer::{Layer, LayerName}, + timeline::{DeltaLayerTestDesc, EvictionError}, + Timeline, }; #[tokio::test] diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 
73e4f0e87c..70f93656cd 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -49,9 +49,10 @@ use pageserver_api::config::tenant_conf_defaults::{ DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, }; -use crate::keyspace::KeySpace; -use crate::repository::{Key, Value}; -use crate::walrecord::NeonWalRecord; +use pageserver_api::key::Key; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::value::Value; use utils::lsn::Lsn; @@ -2148,7 +2149,7 @@ struct ResidentDeltaLayer(ResidentLayer); struct ResidentImageLayer(ResidentLayer); impl CompactionJobExecutor for TimelineAdaptor { - type Key = crate::repository::Key; + type Key = pageserver_api::key::Key; type Layer = OwnArc; type DeltaLayer = ResidentDeltaLayer; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index a664bb59e1..2c6161da15 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; -use tracing::{error, info, instrument, Instrument}; +use tracing::{error, info, info_span, instrument, Instrument}; use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; use crate::{ @@ -14,10 +14,9 @@ use crate::{ task_mgr::{self, TaskKind}, tenant::{ metadata::TimelineMetadata, - remote_timeline_client::{ - self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, - }, - CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded, + remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, + CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, + TimelineOrOffloaded, }, }; @@ -176,32 +175,6 @@ async fn remove_maybe_offloaded_timeline_from_tenant( Ok(()) } -/// It is important that this gets called when DeletionGuard is being held. -/// For more context see comments in [`DeleteTimelineFlow::prepare`] -async fn upload_new_tenant_manifest( - tenant: &Tenant, - _: &DeletionGuard, // using it as a witness -) -> anyhow::Result<()> { - // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash - // between the deletion of the index-part.json and reaching of this code. - // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted. - // However, we handle this case in tenant loading code so the next time we attach, the issue is - // resolved. - let manifest = tenant.tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; - remote_timeline_client::upload_tenant_manifest( - &tenant.remote_storage, - &tenant.tenant_shard_id, - generation, - &manifest, - &tenant.cancel, - ) - .await?; - - Ok(()) -} - /// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures, /// and deletes its data from both disk and s3. /// The sequence of steps: @@ -258,7 +231,32 @@ impl DeleteTimelineFlow { ))? 
}); - let remote_client = timeline.remote_client_maybe_construct(tenant); + let remote_client = match timeline.maybe_remote_client() { + Some(remote_client) => remote_client, + None => { + let remote_client = tenant + .build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone()); + let result = remote_client + .download_index_file(&tenant.cancel) + .instrument(info_span!("download_index_file")) + .await + .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?; + let index_part = match result { + MaybeDeletedIndexPart::Deleted(p) => { + tracing::info!("Timeline already set as deleted in remote index"); + p + } + MaybeDeletedIndexPart::IndexPart(p) => p, + }; + let remote_client = Arc::new(remote_client); + + remote_client + .init_upload_queue(&index_part) + .map_err(DeleteTimelineError::Other)?; + remote_client.shutdown().await; + remote_client + } + }; set_deleted_in_remote_index(&remote_client).await?; fail::fail_point!("timeline-delete-before-schedule", |_| { @@ -455,7 +453,15 @@ impl DeleteTimelineFlow { remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?; - upload_new_tenant_manifest(tenant, &guard).await?; + // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash + // between the deletion of the index-part.json and reaching of this code. + // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted. + // However, we handle this case in tenant loading code so the next time we attach, the issue is + // resolved. + tenant + .store_tenant_manifest() + .await + .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?; *guard = Self::Finished; diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 8e6eceb084..305c139b54 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded}; +use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded}; pub(crate) async fn offload_timeline( tenant: &Tenant, @@ -63,17 +63,10 @@ pub(crate) async fn offload_timeline( // at the next restart attach it again. // For that to happen, we'd need to make the manifest reflect our *intended* state, // not our actual state of offloaded timelines. 
- let manifest = tenant.tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; - remote_timeline_client::upload_tenant_manifest( - &tenant.remote_storage, - &tenant.tenant_shard_id, - generation, - &manifest, - &tenant.cancel, - ) - .await?; + tenant + .store_tenant_manifest() + .await + .map_err(|e| anyhow::anyhow!(e))?; Ok(()) } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index cee259e2e0..739fadbc6b 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -31,11 +31,11 @@ use crate::{ task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, - walrecord::{decode_wal_record, DecodedWALRecord}, }; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord}; use utils::{id::NodeId, lsn::Lsn}; use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9e43e10801..27b3f93845 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -29,8 +29,10 @@ use std::time::Instant; use std::time::SystemTime; use pageserver_api::shard::ShardIdentity; +use postgres_ffi::walrecord::*; use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; +use wal_decoder::models::*; use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; @@ -44,9 +46,9 @@ use crate::pgdatadir_mapping::{DatadirModification, Version}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; -use crate::walrecord::*; use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; +use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -108,143 +110,6 @@ struct WarnIngestLag { timestamp_invalid_msg_ratelimit: RateLimit, } -// These structs are an intermediary representation of the PostgreSQL WAL records. -// The ones prefixed with `Xl` are lower level, while the ones that are not have -// all the required context to be acted upon by the pageserver. 
- -enum HeapamRecord { - ClearVmBits(ClearVmBits), -} - -struct ClearVmBits { - new_heap_blkno: Option, - old_heap_blkno: Option, - vm_rel: RelTag, - flags: u8, -} - -enum NeonrmgrRecord { - ClearVmBits(ClearVmBits), -} - -enum SmgrRecord { - Create(SmgrCreate), - Truncate(XlSmgrTruncate), -} - -struct SmgrCreate { - rel: RelTag, -} - -enum DbaseRecord { - Create(DbaseCreate), - Drop(DbaseDrop), -} - -struct DbaseCreate { - db_id: u32, - tablespace_id: u32, - src_db_id: u32, - src_tablespace_id: u32, -} - -struct DbaseDrop { - db_id: u32, - tablespace_ids: Vec, -} - -enum ClogRecord { - ZeroPage(ClogZeroPage), - Truncate(ClogTruncate), -} - -struct ClogZeroPage { - segno: u32, - rpageno: u32, -} - -struct ClogTruncate { - pageno: u32, - oldest_xid: u32, - oldest_xid_db: u32, -} - -enum XactRecord { - Commit(XactCommon), - Abort(XactCommon), - CommitPrepared(XactCommon), - AbortPrepared(XactCommon), - Prepare(XactPrepare), -} - -struct XactCommon { - parsed: XlXactParsedRecord, - origin_id: u16, - // Fields below are only used for logging - xl_xid: u32, - lsn: Lsn, -} - -struct XactPrepare { - xl_xid: u32, - data: Bytes, -} - -enum MultiXactRecord { - ZeroPage(MultiXactZeroPage), - Create(XlMultiXactCreate), - Truncate(XlMultiXactTruncate), -} - -struct MultiXactZeroPage { - slru_kind: SlruKind, - segno: u32, - rpageno: u32, -} - -enum RelmapRecord { - Update(RelmapUpdate), -} - -struct RelmapUpdate { - update: XlRelmapUpdate, - buf: Bytes, -} - -enum XlogRecord { - Raw(RawXlogRecord), -} - -struct RawXlogRecord { - info: u8, - lsn: Lsn, - buf: Bytes, -} - -enum LogicalMessageRecord { - Put(PutLogicalMessage), - #[cfg(feature = "testing")] - Failpoint, -} - -struct PutLogicalMessage { - path: String, - buf: Bytes, -} - -enum StandbyRecord { - RunningXacts(StandbyRunningXacts), -} - -struct StandbyRunningXacts { - oldest_running_xid: u32, -} - -enum ReploriginRecord { - Set(XlReploriginSet), - Drop(XlReploriginDrop), -} - impl WalIngest { pub async fn new( timeline: &Timeline, @@ -284,7 +149,6 @@ impl WalIngest { /// relations/pages that the record affects. 
/// /// This function returns `true` if the record was ingested, and `false` if it was filtered out - /// pub async fn ingest_record( &mut self, decoded: DecodedWALRecord, @@ -2218,7 +2082,7 @@ impl WalIngest { ) -> anyhow::Result> { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_LOGICAL_MESSAGE { - let xlrec = crate::walrecord::XlLogicalMessage::decode(buf); + let xlrec = XlLogicalMessage::decode(buf); let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; #[cfg(feature = "testing")] @@ -2246,7 +2110,7 @@ impl WalIngest { ) -> anyhow::Result> { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_RUNNING_XACTS { - let xlrec = crate::walrecord::XlRunningXacts::decode(buf); + let xlrec = XlRunningXacts::decode(buf); return Ok(Some(StandbyRecord::RunningXacts(StandbyRunningXacts { oldest_running_xid: xlrec.oldest_running_xid, }))); @@ -2276,10 +2140,10 @@ impl WalIngest { ) -> anyhow::Result> { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_REPLORIGIN_SET { - let xlrec = crate::walrecord::XlReploriginSet::decode(buf); + let xlrec = XlReploriginSet::decode(buf); return Ok(Some(ReploriginRecord::Set(xlrec))); } else if info == pg_constants::XLOG_REPLORIGIN_DROP { - let xlrec = crate::walrecord::XlReploriginDrop::decode(buf); + let xlrec = XlReploriginDrop::decode(buf); return Ok(Some(ReploriginRecord::Drop(xlrec))); } @@ -3146,6 +3010,7 @@ mod tests { async fn test_ingest_real_wal() { use crate::tenant::harness::*; use postgres_ffi::waldecoder::WalStreamDecoder; + use postgres_ffi::walrecord::decode_wal_record; use postgres_ffi::WAL_SEGMENT_SIZE; // Define test data path and constants. diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index a1c9fc5651..027a6eb7d7 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -29,11 +29,11 @@ use crate::metrics::{ WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, }; -use crate::repository::Key; -use crate::walrecord::NeonWalRecord; use anyhow::Context; use bytes::{Bytes, BytesMut}; +use pageserver_api::key::Key; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; +use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; use std::future::Future; use std::sync::Arc; @@ -548,9 +548,10 @@ impl PostgresRedoManager { #[cfg(test)] mod tests { use super::PostgresRedoManager; - use crate::repository::Key; - use crate::{config::PageServerConf, walrecord::NeonWalRecord}; + use crate::config::PageServerConf; use bytes::Bytes; + use pageserver_api::key::Key; + use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; use std::str::FromStr; use tracing::Instrument; diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index c067787f97..7aaa357318 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -1,8 +1,8 @@ -use crate::walrecord::NeonWalRecord; use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use bytes::BytesMut; use pageserver_api::key::Key; +use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; @@ -238,7 +238,7 @@ pub(crate) fn apply_in_neon( // No-op: this record will never be created in aux v2. 
warn!("AuxFile record should not be created in aux v2"); } - #[cfg(test)] + #[cfg(feature = "testing")] NeonWalRecord::Test { append, clear, diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index f3197e68b5..7e9477cfbc 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -8,10 +8,10 @@ use crate::{ metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, page_cache::PAGE_SZ, span::debug_assert_current_span_has_tenant_id, - walrecord::NeonWalRecord, }; use anyhow::Context; use bytes::Bytes; +use pageserver_api::record::NeonWalRecord; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; use postgres_ffi::BLCKSZ; #[cfg(feature = "testing")] diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 42f2a8efda..c87ae59fd6 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -16,6 +16,7 @@ OBJS = \ neon_walreader.o \ pagestore_smgr.o \ relsize_cache.o \ + unstable_extensions.o \ walproposer.o \ walproposer_pg.o \ control_plane_connector.o \ diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 4713103909..b47b22cd20 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -18,6 +18,7 @@ * *------------------------------------------------------------------------- */ + #include "postgres.h" #include @@ -508,6 +509,8 @@ NeonXactCallback(XactEvent event, void *arg) static bool RoleIsNeonSuperuser(const char *role_name) { + Assert(role_name); + return strcmp(role_name, "neon_superuser") == 0; } @@ -670,7 +673,7 @@ HandleCreateRole(CreateRoleStmt *stmt) static void HandleAlterRole(AlterRoleStmt *stmt) { - const char *role_name = stmt->role->rolename; + char *role_name; DefElem *dpass; ListCell *option; bool found = false; @@ -678,6 +681,7 @@ HandleAlterRole(AlterRoleStmt *stmt) InitRoleTableIfNeeded(); + role_name = get_rolespec_name(stmt->role); if (RoleIsNeonSuperuser(role_name) && !superuser()) elog(ERROR, "can't ALTER neon_superuser"); @@ -689,9 +693,13 @@ HandleAlterRole(AlterRoleStmt *stmt) if (strcmp(defel->defname, "password") == 0) dpass = defel; } + /* We only care about updates to the password */ if (!dpass) + { + pfree(role_name); return; + } entry = hash_search(CurrentDdlTable->role_table, role_name, @@ -704,6 +712,8 @@ HandleAlterRole(AlterRoleStmt *stmt) else entry->password = NULL; entry->type = Op_Set; + + pfree(role_name); } static void diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index f8ec725c18..dc87d79e87 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -30,6 +30,7 @@ #include "neon.h" #include "control_plane_connector.h" #include "logical_replication_monitor.h" +#include "unstable_extensions.h" #include "walsender_hooks.h" #if PG_MAJORVERSION_NUM >= 16 #include "storage/ipc.h" @@ -424,6 +425,7 @@ _PG_init(void) LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + InitUnstableExtensionsSupport(); InitLogicalReplicationMonitor(); InitControlPlaneConnector(); diff --git a/pgxn/neon/neon_pgversioncompat.c b/pgxn/neon/neon_pgversioncompat.c index a0dbddde4b..7c404fb5a9 100644 --- a/pgxn/neon/neon_pgversioncompat.c +++ b/pgxn/neon/neon_pgversioncompat.c @@ -42,3 +42,4 @@ InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags) MemoryContextSwitchTo(old_context); } #endif + diff --git a/pgxn/neon/unstable_extensions.c b/pgxn/neon/unstable_extensions.c new file mode 100644 index 0000000000..72de2871f4 
--- /dev/null +++ b/pgxn/neon/unstable_extensions.c @@ -0,0 +1,129 @@ +#include +#include + +#include "postgres.h" + +#include "nodes/plannodes.h" +#include "nodes/parsenodes.h" +#include "tcop/utility.h" +#include "utils/errcodes.h" +#include "utils/guc.h" + +#include "neon_pgversioncompat.h" +#include "unstable_extensions.h" + +static bool allow_unstable_extensions = false; +static char *unstable_extensions = NULL; + +static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; + +static bool +list_contains(char const* comma_separated_list, char const* val) +{ + char const* occ = comma_separated_list; + size_t val_len = strlen(val); + + if (val_len == 0) + return false; + + while ((occ = strstr(occ, val)) != NULL) + { + if ((occ == comma_separated_list || occ[-1] == ',') + && (occ[val_len] == '\0' || occ[val_len] == ',')) + { + return true; + } + occ += val_len; + } + + return false; +} + + +static void +CheckUnstableExtension( + PlannedStmt *pstmt, + const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + QueryCompletion *qc) +{ + Node *parseTree = pstmt->utilityStmt; + + if (allow_unstable_extensions || unstable_extensions == NULL) + goto process; + + switch (nodeTag(parseTree)) + { + case T_CreateExtensionStmt: + { + CreateExtensionStmt *stmt = castNode(CreateExtensionStmt, parseTree); + if (list_contains(unstable_extensions, stmt->extname)) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("%s extension is in beta and may be unstable or introduce backward-incompatible changes.\nWe recommend testing it in a separate, dedicated Neon project.", stmt->extname), + errhint("to proceed with installation, run SET neon.allow_unstable_extensions='true'"))); + } + break; + } + default: + goto process; + } + +process: + if (PreviousProcessUtilityHook) + { + PreviousProcessUtilityHook( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + } + else + { + standard_ProcessUtility( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + } +} + +void +InitUnstableExtensionsSupport(void) +{ + DefineCustomBoolVariable( + "neon.allow_unstable_extensions", + "Allow unstable extensions to be installed and used", + NULL, + &allow_unstable_extensions, + false, + PGC_USERSET, + 0, + NULL, NULL, NULL); + + DefineCustomStringVariable( + "neon.unstable_extensions", + "List of unstable extensions", + NULL, + &unstable_extensions, + NULL, + PGC_SUSET, + 0, + NULL, NULL, NULL); + + PreviousProcessUtilityHook = ProcessUtility_hook; + ProcessUtility_hook = CheckUnstableExtension; +} diff --git a/pgxn/neon/unstable_extensions.h b/pgxn/neon/unstable_extensions.h new file mode 100644 index 0000000000..3c695e9fb2 --- /dev/null +++ b/pgxn/neon/unstable_extensions.h @@ -0,0 +1,6 @@ +#ifndef __NEON_UNSTABLE_EXTENSIONS_H__ +#define __NEON_UNSTABLE_EXTENSIONS_H__ + +void InitUnstableExtensionsSupport(void); + +#endif diff --git a/poetry.lock b/poetry.lock index e307b873f3..36ea82a446 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
[[package]] name = "aiohappyeyeballs" @@ -1521,6 +1521,21 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "jwcrypto" +version = "1.5.6" +description = "Implementation of JOSE Web standards" +optional = false +python-versions = ">= 3.8" +files = [ + {file = "jwcrypto-1.5.6-py3-none-any.whl", hash = "sha256:150d2b0ebbdb8f40b77f543fb44ffd2baeff48788be71f67f03566692fd55789"}, + {file = "jwcrypto-1.5.6.tar.gz", hash = "sha256:771a87762a0c081ae6166958a954f80848820b2ab066937dc8b8379d65b1b039"}, +] + +[package.dependencies] +cryptography = ">=3.4" +typing-extensions = ">=4.5.0" + [[package]] name = "kafka-python" version = "2.0.2" @@ -2111,7 +2126,6 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -2120,8 +2134,6 @@ files = [ {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, @@ -2603,7 +2615,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = 
"PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2912,6 +2923,20 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "types-jwcrypto" +version = "1.5.0.20240925" +description = "Typing stubs for jwcrypto" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-jwcrypto-1.5.0.20240925.tar.gz", hash = "sha256:50e17b790378c96239344476c7bd13b52d0c7eeb6d16c2d53723e48cc6bbf4fe"}, + {file = "types_jwcrypto-1.5.0.20240925-py3-none-any.whl", hash = "sha256:2d12a2d528240d326075e896aafec7056b9136bf3207fa6ccf3fcb8fbf9e11a1"}, +] + +[package.dependencies] +cryptography = "*" + [[package]] name = "types-psutil" version = "5.9.5.12" @@ -3118,13 +3143,13 @@ files = [ [[package]] name = "werkzeug" -version = "3.0.3" +version = "3.0.6" description = "The comprehensive WSGI web application library." optional = false python-versions = ">=3.8" files = [ - {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"}, - {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"}, + {file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"}, + {file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"}, ] [package.dependencies] @@ -3159,16 +3184,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = 
"sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3406,4 +3421,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91" +content-hash = "ad5c9ee7723359af22bbd7fa41538dcf78913c02e947a13a8f9a87eb3a59039e" diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 8ab8d5d37f..28bdacd769 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -1,5 +1,5 @@ use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use tracing::{debug, info}; use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; use crate::auth::{self, AuthFlow}; @@ -21,7 +21,7 @@ pub(crate) async fn authenticate_cleartext( secret: AuthSecret, config: &'static AuthenticationConfig, ) -> auth::Result { - warn!("cleartext auth flow override is enabled, proceeding"); + debug!("cleartext auth flow override is enabled, proceeding"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client @@ -61,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication( info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, ) -> auth::Result<(ComputeUserInfo, Vec)> { - warn!("project not specified, resorting to the password hack auth flow"); + debug!("project not specified, resorting to the password hack auth flow"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 69ab4b8ccb..83c3617612 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::future::Future; use std::sync::Arc; use std::time::{Duration, SystemTime}; @@ -45,6 +46,7 @@ pub(crate) enum FetchAuthRulesError { RoleJwksNotConfigured, } +#[derive(Clone)] pub(crate) struct AuthRule { pub(crate) id: String, pub(crate) jwks_url: url::Url, @@ -277,7 +279,7 @@ impl JwkCacheEntryLock { // get the key from the JWKs if possible. If not, wait for the keys to update. 
let (jwk, expected_audience) = loop { - match guard.find_jwk_and_audience(kid, role_name) { + match guard.find_jwk_and_audience(&kid, role_name) { Some(jwk) => break jwk, None if guard.last_retrieved.elapsed() > MIN_RENEW => { let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); @@ -312,7 +314,9 @@ impl JwkCacheEntryLock { if let Some(aud) = expected_audience { if payload.audience.0.iter().all(|s| s != aud) { - return Err(JwtError::InvalidJwtTokenAudience); + return Err(JwtError::InvalidClaims( + JwtClaimsError::InvalidJwtTokenAudience, + )); } } @@ -320,13 +324,15 @@ impl JwkCacheEntryLock { if let Some(exp) = payload.expiration { if now >= exp + CLOCK_SKEW_LEEWAY { - return Err(JwtError::JwtTokenHasExpired); + return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired)); } } if let Some(nbf) = payload.not_before { if nbf >= now + CLOCK_SKEW_LEEWAY { - return Err(JwtError::JwtTokenNotYetReadyToUse); + return Err(JwtError::InvalidClaims( + JwtClaimsError::JwtTokenNotYetReadyToUse, + )); } } @@ -420,8 +426,8 @@ struct JwtHeader<'a> { #[serde(rename = "alg")] algorithm: jose_jwa::Algorithm, /// key id, must be provided for our usecase - #[serde(rename = "kid")] - key_id: Option<&'a str>, + #[serde(rename = "kid", borrow)] + key_id: Option>, } /// @@ -440,17 +446,17 @@ struct JwtPayload<'a> { // the following entries are only extracted for the sake of debug logging. /// Issuer of the JWT - #[serde(rename = "iss")] - issuer: Option<&'a str>, + #[serde(rename = "iss", borrow)] + issuer: Option>, /// Subject of the JWT (the user) - #[serde(rename = "sub")] - subject: Option<&'a str>, + #[serde(rename = "sub", borrow)] + subject: Option>, /// Unique token identifier - #[serde(rename = "jti")] - jwt_id: Option<&'a str>, + #[serde(rename = "jti", borrow)] + jwt_id: Option>, /// Unique session identifier - #[serde(rename = "sid")] - session_id: Option<&'a str>, + #[serde(rename = "sid", borrow)] + session_id: Option>, } /// `OneOrMany` supports parsing either a single item or an array of items. 
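Context for the hunk above: the header and payload fields move from `&'a str` to `Cow<'a, str>` with `#[serde(borrow)]` because serde_json can only borrow strings that contain no escape sequences; an issuer like the escaped Cognito URL exercised in the tests further down would otherwise fail to deserialize. A minimal standalone sketch of that behaviour (the `Claims` struct and URL here are illustrative only, not proxy code):

use serde::Deserialize;
use std::borrow::Cow;

#[derive(Deserialize)]
struct Claims<'a> {
    // Borrows from the input when possible, allocates when the JSON string is escaped.
    #[serde(rename = "iss", borrow)]
    issuer: Option<Cow<'a, str>>,
}

fn main() {
    // The escaped slashes force an owned String; a plain `&str` field would fail here
    // with "invalid type: string, expected a borrowed string".
    let json = r#"{"iss":"https:\/\/cognito-idp.example.com\/pool"}"#;
    let claims: Claims = serde_json::from_str(json).unwrap();
    assert!(matches!(claims.issuer, Some(Cow::Owned(_))));
}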
@@ -585,14 +591,8 @@ pub(crate) enum JwtError { #[error("Provided authentication token is not a valid JWT encoding")] JwtEncoding(#[from] JwtEncodingError), - #[error("invalid JWT token audience")] - InvalidJwtTokenAudience, - - #[error("JWT token has expired")] - JwtTokenHasExpired, - - #[error("JWT token is not yet ready to use")] - JwtTokenNotYetReadyToUse, + #[error(transparent)] + InvalidClaims(#[from] JwtClaimsError), #[error("invalid P256 key")] InvalidP256Key(jose_jwk::crypto::Error), @@ -644,6 +644,19 @@ pub enum JwtEncodingError { InvalidCompactForm, } +#[derive(Error, Debug, PartialEq)] +#[non_exhaustive] +pub enum JwtClaimsError { + #[error("invalid JWT token audience")] + InvalidJwtTokenAudience, + + #[error("JWT token has expired")] + JwtTokenHasExpired, + + #[error("JWT token is not yet ready to use")] + JwtTokenNotYetReadyToUse, +} + #[allow(dead_code, reason = "Debug use only")] #[derive(Debug)] pub(crate) enum KeyType { @@ -680,6 +693,8 @@ mod tests { use hyper_util::rt::TokioIo; use rand::rngs::OsRng; use rsa::pkcs8::DecodePrivateKey; + use serde::Serialize; + use serde_json::json; use signature::Signer; use tokio::net::TcpListener; @@ -693,6 +708,7 @@ mod tests { key: jose_jwk::Key::Ec(pk), prm: jose_jwk::Parameters { kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)), ..Default::default() }, }; @@ -706,24 +722,47 @@ mod tests { key: jose_jwk::Key::Rsa(pk), prm: jose_jwk::Parameters { kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)), ..Default::default() }, }; (sk, jwk) } + fn now() -> u64 { + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs() + } + fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String { + let now = now(); + let body = typed_json::json! {{ + "exp": now + 3600, + "nbf": now, + "aud": ["audience1", "neon", "audience2"], + "sub": "user1", + "sid": "session1", + "jti": "token1", + "iss": "neon-testing", + }}; + build_custom_jwt_payload(kid, body, sig) + } + + fn build_custom_jwt_payload( + kid: String, + body: impl Serialize, + sig: jose_jwa::Signing, + ) -> String { let header = JwtHeader { algorithm: jose_jwa::Algorithm::Signing(sig), - key_id: Some(&kid), + key_id: Some(Cow::Owned(kid)), }; - let body = typed_json::json! 
{{ - "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600, - }}; let header = base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD); - let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD); + let body = base64::encode_config(serde_json::to_string(&body).unwrap(), URL_SAFE_NO_PAD); format!("{header}.{body}") } @@ -738,6 +777,16 @@ mod tests { format!("{payload}.{sig}") } + fn new_custom_ec_jwt(kid: String, key: &p256::SecretKey, body: impl Serialize) -> String { + use p256::ecdsa::{Signature, SigningKey}; + + let payload = build_custom_jwt_payload(kid, body, jose_jwa::Signing::Es256); + let sig: Signature = SigningKey::from(key).sign(payload.as_bytes()); + let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + + format!("{payload}.{sig}") + } + fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String { use rsa::pkcs1v15::SigningKey; use rsa::signature::SignatureEncoding; @@ -809,37 +858,34 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL -----END PRIVATE KEY----- "; - #[tokio::test] - async fn renew() { - let (rs1, jwk1) = new_rsa_jwk(RS1, "1".into()); - let (rs2, jwk2) = new_rsa_jwk(RS2, "2".into()); - let (ec1, jwk3) = new_ec_jwk("3".into()); - let (ec2, jwk4) = new_ec_jwk("4".into()); + #[derive(Clone)] + struct Fetch(Vec); - let foo_jwks = jose_jwk::JwkSet { - keys: vec![jwk1, jwk3], - }; - let bar_jwks = jose_jwk::JwkSet { - keys: vec![jwk2, jwk4], - }; + impl FetchAuthRules for Fetch { + async fn fetch_auth_rules( + &self, + _ctx: &RequestMonitoring, + _endpoint: EndpointId, + ) -> Result, FetchAuthRulesError> { + Ok(self.0.clone()) + } + } + async fn jwks_server( + router: impl for<'a> Fn(&'a str) -> Option> + Send + Sync + 'static, + ) -> SocketAddr { + let router = Arc::new(router); let service = service_fn(move |req| { - let foo_jwks = foo_jwks.clone(); - let bar_jwks = bar_jwks.clone(); + let router = Arc::clone(&router); async move { - let jwks = match req.uri().path() { - "/foo" => &foo_jwks, - "/bar" => &bar_jwks, - _ => { - return Response::builder() - .status(404) - .body(Full::new(Bytes::new())); - } - }; - let body = serde_json::to_vec(jwks).unwrap(); - Response::builder() - .status(200) - .body(Full::new(Bytes::from(body))) + match router(req.uri().path()) { + Some(body) => Response::builder() + .status(200) + .body(Full::new(Bytes::from(body))), + None => Response::builder() + .status(404) + .body(Full::new(Bytes::new())), + } } }); @@ -854,84 +900,61 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL } }); - let client = reqwest::Client::new(); + addr + } - #[derive(Clone)] - struct Fetch(SocketAddr, Vec); + #[tokio::test] + async fn check_jwt_happy_path() { + let (rs1, jwk1) = new_rsa_jwk(RS1, "rs1".into()); + let (rs2, jwk2) = new_rsa_jwk(RS2, "rs2".into()); + let (ec1, jwk3) = new_ec_jwk("ec1".into()); + let (ec2, jwk4) = new_ec_jwk("ec2".into()); - impl FetchAuthRules for Fetch { - async fn fetch_auth_rules( - &self, - _ctx: &RequestMonitoring, - _endpoint: EndpointId, - ) -> Result, FetchAuthRulesError> { - Ok(vec![ - AuthRule { - id: "foo".to_owned(), - jwks_url: format!("http://{}/foo", self.0).parse().unwrap(), - audience: None, - role_names: self.1.clone(), - }, - AuthRule { - id: "bar".to_owned(), - jwks_url: format!("http://{}/bar", self.0).parse().unwrap(), - audience: None, - role_names: self.1.clone(), - }, - ]) - } - } + let foo_jwks = jose_jwk::JwkSet { + keys: vec![jwk1, jwk3], + }; + let bar_jwks = jose_jwk::JwkSet { + 
keys: vec![jwk2, jwk4], + }; + + let jwks_addr = jwks_server(move |path| match path { + "/foo" => Some(serde_json::to_vec(&foo_jwks).unwrap()), + "/bar" => Some(serde_json::to_vec(&bar_jwks).unwrap()), + _ => None, + }) + .await; let role_name1 = RoleName::from("anonymous"); let role_name2 = RoleName::from("authenticated"); - let fetch = Fetch( - addr, - vec![ - RoleNameInt::from(&role_name1), - RoleNameInt::from(&role_name2), - ], - ); + let roles = vec![ + RoleNameInt::from(&role_name1), + RoleNameInt::from(&role_name2), + ]; + let rules = vec![ + AuthRule { + id: "foo".to_owned(), + jwks_url: format!("http://{jwks_addr}/foo").parse().unwrap(), + audience: None, + role_names: roles.clone(), + }, + AuthRule { + id: "bar".to_owned(), + jwks_url: format!("http://{jwks_addr}/bar").parse().unwrap(), + audience: None, + role_names: roles.clone(), + }, + ]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); let endpoint = EndpointId::from("ep"); - let jwk_cache = Arc::new(JwkCacheEntryLock::default()); - - let jwt1 = new_rsa_jwt("1".into(), rs1); - let jwt2 = new_rsa_jwt("2".into(), rs2); - let jwt3 = new_ec_jwt("3".into(), &ec1); - let jwt4 = new_ec_jwt("4".into(), &ec2); - - // had the wrong kid, therefore will have the wrong ecdsa signature - let bad_jwt = new_ec_jwt("3".into(), &ec2); - // this role_name is not accepted - let bad_role_name = RoleName::from("cloud_admin"); - - let err = jwk_cache - .check_jwt( - &RequestMonitoring::test(), - &bad_jwt, - &client, - endpoint.clone(), - &role_name1, - &fetch, - ) - .await - .unwrap_err(); - assert!(err.to_string().contains("signature error")); - - let err = jwk_cache - .check_jwt( - &RequestMonitoring::test(), - &jwt1, - &client, - endpoint.clone(), - &bad_role_name, - &fetch, - ) - .await - .unwrap_err(); - assert!(err.to_string().contains("jwk not found")); + let jwt1 = new_rsa_jwt("rs1".into(), rs1); + let jwt2 = new_rsa_jwt("rs2".into(), rs2); + let jwt3 = new_ec_jwt("ec1".into(), &ec1); + let jwt4 = new_ec_jwt("ec2".into(), &ec2); let tokens = [jwt1, jwt2, jwt3, jwt4]; let role_names = [role_name1, role_name2]; @@ -940,15 +963,250 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL jwk_cache .check_jwt( &RequestMonitoring::test(), - token, - &client, endpoint.clone(), role, &fetch, + token, ) .await .unwrap(); } } } + + /// AWS Cognito escapes the `/` in the URL. + #[tokio::test] + async fn check_jwt_regression_cognito_issuer() { + let (key, jwk) = new_ec_jwk("key".into()); + + let now = now(); + let token = new_custom_ec_jwt( + "key".into(), + &key, + typed_json::json! {{ + "sub": "dd9a73fd-e785-4a13-aae1-e691ce43e89d", + // cognito uses `\/`. I cannot replicated that easily here as serde_json will refuse + // to write that escape character. instead I will make a bogus URL using `\` instead. 
+ "iss": "https:\\\\cognito-idp.us-west-2.amazonaws.com\\us-west-2_abcdefgh", + "client_id": "abcdefghijklmnopqrstuvwxyz", + "origin_jti": "6759d132-3fe7-446e-9e90-2fe7e8017893", + "event_id": "ec9c36ab-b01d-46a0-94e4-87fde6767065", + "token_use": "access", + "scope": "aws.cognito.signin.user.admin", + "auth_time":now, + "exp":now + 60, + "iat":now, + "jti": "b241614b-0b93-4bdc-96db-0a3c7061d9c0", + "username": "dd9a73fd-e785-4a13-aae1-e691ce43e89d", + }}, + ); + + let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; + + let jwks_addr = jwks_server(move |_path| Some(serde_json::to_vec(&jwks).unwrap())).await; + + let role_name = RoleName::from("anonymous"); + let rules = vec![AuthRule { + id: "aws-cognito".to_owned(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: None, + role_names: vec![RoleNameInt::from(&role_name)], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let endpoint = EndpointId::from("ep"); + + jwk_cache + .check_jwt( + &RequestMonitoring::test(), + endpoint.clone(), + &role_name, + &fetch, + &token, + ) + .await + .unwrap(); + } + + #[tokio::test] + async fn check_jwt_invalid_signature() { + let (_, jwk) = new_ec_jwk("1".into()); + let (key, _) = new_ec_jwk("1".into()); + + // has a matching kid, but signed by the wrong key + let bad_jwt = new_ec_jwt("1".into(), &key); + + let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; + let jwks_addr = jwks_server(move |path| match path { + "/" => Some(serde_json::to_vec(&jwks).unwrap()), + _ => None, + }) + .await; + + let role = RoleName::from("authenticated"); + + let rules = vec![AuthRule { + id: String::new(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: None, + role_names: vec![RoleNameInt::from(&role)], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let ep = EndpointId::from("ep"); + + let ctx = RequestMonitoring::test(); + let err = jwk_cache + .check_jwt(&ctx, ep, &role, &fetch, &bad_jwt) + .await + .unwrap_err(); + assert!( + matches!(err, JwtError::Signature(_)), + "expected \"signature error\", got {err:?}" + ); + } + + #[tokio::test] + async fn check_jwt_unknown_role() { + let (key, jwk) = new_rsa_jwk(RS1, "1".into()); + let jwt = new_rsa_jwt("1".into(), key); + + let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; + let jwks_addr = jwks_server(move |path| match path { + "/" => Some(serde_json::to_vec(&jwks).unwrap()), + _ => None, + }) + .await; + + let role = RoleName::from("authenticated"); + let rules = vec![AuthRule { + id: String::new(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: None, + role_names: vec![RoleNameInt::from(&role)], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let ep = EndpointId::from("ep"); + + // this role_name is not accepted + let bad_role_name = RoleName::from("cloud_admin"); + + let ctx = RequestMonitoring::test(); + let err = jwk_cache + .check_jwt(&ctx, ep, &bad_role_name, &fetch, &jwt) + .await + .unwrap_err(); + + assert!( + matches!(err, JwtError::JwkNotFound), + "expected \"jwk not found\", got {err:?}" + ); + } + + #[tokio::test] + async fn check_jwt_invalid_claims() { + let (key, jwk) = new_ec_jwk("1".into()); + + let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; + let jwks_addr = jwks_server(move |path| match path { + "/" => Some(serde_json::to_vec(&jwks).unwrap()), + _ => None, + }) + .await; + + let now = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs(); + + struct Test { + body: 
serde_json::Value, + error: JwtClaimsError, + } + + let table = vec![ + Test { + body: json! {{ + "nbf": now + 60, + "aud": "neon", + }}, + error: JwtClaimsError::JwtTokenNotYetReadyToUse, + }, + Test { + body: json! {{ + "exp": now - 60, + "aud": ["neon"], + }}, + error: JwtClaimsError::JwtTokenHasExpired, + }, + Test { + body: json! {{ + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + Test { + body: json! {{ + "aud": [], + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + Test { + body: json! {{ + "aud": "foo", + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + Test { + body: json! {{ + "aud": ["foo"], + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + Test { + body: json! {{ + "aud": ["foo", "bar"], + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + ]; + + let role = RoleName::from("authenticated"); + + let rules = vec![AuthRule { + id: String::new(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: Some("neon".to_string()), + role_names: vec![RoleNameInt::from(&role)], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let ep = EndpointId::from("ep"); + + let ctx = RequestMonitoring::test(); + for test in table { + let jwt = new_custom_ec_jwt("1".into(), &key, test.body); + + match jwk_cache + .check_jwt(&ctx, ep.clone(), &role, &fetch, &jwt) + .await + { + Err(JwtError::InvalidClaims(error)) if error == test.error => {} + Err(err) => { + panic!("expected {:?}, got {err:?}", test.error) + } + Ok(_payload) => { + panic!("expected {:?}, got ok", test.error) + } + } + } + } } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 6e190029aa..82c259efc8 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -137,9 +137,6 @@ struct ProxyCliArgs { /// size of the threadpool for password hashing #[clap(long, default_value_t = 4)] scram_thread_pool_size: u8, - /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. - #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - disable_dynamic_rate_limiter: bool, /// Endpoint rate limiter max number of requests per second. /// /// Provided in the form `@`. 
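The rate limiter option above takes a request count and an interval joined by `@` (read here as `<requests>@<interval>`; those placeholders are an assumption, and the authoritative parsing is the proxy's RateBucketInfo FromStr impl, which is not part of this diff). A rough, hypothetical sketch of that shape:

use std::time::Duration;

// Hypothetical parser for a `<requests>@<interval>` spec such as "100@1s";
// it only handles whole-second intervals and is not the proxy's real parser.
fn parse_rate_spec(spec: &str) -> Option<(u32, Duration)> {
    let (rps, interval) = spec.split_once('@')?;
    let rps: u32 = rps.parse().ok()?;
    let secs: u64 = interval.strip_suffix('s')?.parse().ok()?;
    Some((rps, Duration::from_secs(secs)))
}

fn main() {
    assert_eq!(parse_rate_spec("100@1s"), Some((100, Duration::from_secs(1))));
}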
@@ -615,9 +612,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { and metric-collection-interval must be specified" ), }; - if !args.disable_dynamic_rate_limiter { - bail!("dynamic rate limiter should be disabled"); - } let config::ConcurrencyLockOptions { shards, diff --git a/proxy/src/serverless/error.rs b/proxy/src/serverless/error.rs new file mode 100644 index 0000000000..323c91baa5 --- /dev/null +++ b/proxy/src/serverless/error.rs @@ -0,0 +1,5 @@ +use http::StatusCode; + +pub trait HttpCodeError { + fn get_http_status_code(&self) -> StatusCode; +} diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 8fb7a771d9..edbb0347d3 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -6,6 +6,7 @@ mod backend; pub mod cancel_set; mod conn_pool; mod conn_pool_lib; +mod error; mod http_conn_pool; mod http_util; mod json; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 1f3eec6d19..0713c27d65 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -28,6 +28,7 @@ use uuid::Uuid; use super::backend::{LocalProxyConnError, PoolingBackend}; use super::conn_pool::{AuthData, ConnInfoWithAuth}; use super::conn_pool_lib::{self, ConnInfo}; +use super::error::HttpCodeError; use super::http_util::json_response; use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; use super::local_conn_pool; @@ -238,7 +239,6 @@ fn get_conn_info( Ok(ConnInfoWithAuth { conn_info, auth }) } -// TODO: return different http error codes pub(crate) async fn handle( config: &'static ProxyConfig, ctx: RequestMonitoring, @@ -319,9 +319,8 @@ pub(crate) async fn handle( "forwarding error to user" ); - // TODO: this shouldn't always be bad request. 
json_response( - StatusCode::BAD_REQUEST, + e.get_http_status_code(), json!({ "message": message, "code": code, @@ -405,6 +404,25 @@ impl UserFacingError for SqlOverHttpError { } } +impl HttpCodeError for SqlOverHttpError { + fn get_http_status_code(&self) -> StatusCode { + match self { + SqlOverHttpError::ReadPayload(_) => StatusCode::BAD_REQUEST, + SqlOverHttpError::ConnectCompute(h) => match h.get_error_kind() { + ErrorKind::User => StatusCode::BAD_REQUEST, + _ => StatusCode::INTERNAL_SERVER_ERROR, + }, + SqlOverHttpError::ConnInfo(_) => StatusCode::BAD_REQUEST, + SqlOverHttpError::RequestTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE, + SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE, + SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST, + SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST, + SqlOverHttpError::JsonConversion(_) => StatusCode::INTERNAL_SERVER_ERROR, + SqlOverHttpError::Cancelled(_) => StatusCode::INTERNAL_SERVER_ERROR, + } + } +} + #[derive(Debug, thiserror::Error)] pub(crate) enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] diff --git a/pyproject.toml b/pyproject.toml index 862ed49638..faa5f9123c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" -Werkzeug = "^3.0.3" +Werkzeug = "^3.0.6" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" pytest-asyncio = "^0.21.0" @@ -42,6 +42,9 @@ pytest-repeat = "^0.9.3" websockets = "^12.0" clickhouse-connect = "^0.7.16" kafka-python = "^2.0.2" +jwcrypto = "^1.5.6" +h2 = "^4.1.0" +types-jwcrypto = "^1.5.0.20240925" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index b4590fe3e5..df68f8a68e 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -262,14 +262,6 @@ async fn timeline_snapshot_handler(request: Request) -> Result, // so create the chan and write to it in another task. diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index c7f5165f90..c700e18cc7 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -8,6 +8,7 @@ use serde::{Deserialize, Serialize}; use std::{ cmp::min, io::{self, ErrorKind}, + sync::Arc, }; use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; use tokio_tar::{Archive, Builder, Header}; @@ -25,8 +26,8 @@ use crate::{ routes::TimelineStatus, }, safekeeper::Term, - state::TimelinePersistentState, - timeline::WalResidentTimeline, + state::{EvictionState, TimelinePersistentState}, + timeline::{Timeline, WalResidentTimeline}, timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, wal_backup, wal_storage::open_wal_file, @@ -43,18 +44,33 @@ use utils::{ /// Stream tar archive of timeline to tx. #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] pub async fn stream_snapshot( - tli: WalResidentTimeline, + tli: Arc, source: NodeId, destination: NodeId, tx: mpsc::Sender>, ) { - if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await { - // Error type/contents don't matter as they won't can't reach the client - // (hyper likely doesn't do anything with it), but http stream will be - // prematurely terminated. It would be nice to try to send the error in - // trailers though. 
- tx.send(Err(anyhow!("snapshot failed"))).await.ok(); - error!("snapshot failed: {:#}", e); + match tli.try_wal_residence_guard().await { + Err(e) => { + tx.send(Err(anyhow!("Error checking residence: {:#}", e))) + .await + .ok(); + } + Ok(maybe_resident_tli) => { + if let Err(e) = match maybe_resident_tli { + Some(resident_tli) => { + stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone()) + .await + } + None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await, + } { + // Error type/contents don't matter as they won't can't reach the client + // (hyper likely doesn't do anything with it), but http stream will be + // prematurely terminated. It would be nice to try to send the error in + // trailers though. + tx.send(Err(anyhow!("snapshot failed"))).await.ok(); + error!("snapshot failed: {:#}", e); + } + } } } @@ -80,12 +96,10 @@ impl Drop for SnapshotContext { } } -pub async fn stream_snapshot_guts( - tli: WalResidentTimeline, - source: NodeId, - destination: NodeId, +/// Build a tokio_tar stream that sends encoded bytes into a Bytes channel. +fn prepare_tar_stream( tx: mpsc::Sender>, -) -> Result<()> { +) -> tokio_tar::Builder { // tokio-tar wants Write implementor, but we have mpsc tx >; // use SinkWriter as a Write impl. That is, // - create Sink from the tx. It returns PollSendError if chan is closed. @@ -100,12 +114,38 @@ pub async fn stream_snapshot_guts( // - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap // into CopyToBytes. This is a data copy. let copy_to_bytes = CopyToBytes::new(oksink); - let mut writer = SinkWriter::new(copy_to_bytes); - let pinned_writer = std::pin::pin!(writer); + let writer = SinkWriter::new(copy_to_bytes); + let pinned_writer = Box::pin(writer); // Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer // which is also likely suboptimal. 
- let mut ar = Builder::new_non_terminated(pinned_writer); + Builder::new_non_terminated(pinned_writer) +} + +/// Implementation of snapshot for an offloaded timeline, only reads control file +pub(crate) async fn stream_snapshot_offloaded_guts( + tli: Arc, + source: NodeId, + destination: NodeId, + tx: mpsc::Sender>, +) -> Result<()> { + let mut ar = prepare_tar_stream(tx); + + tli.snapshot_offloaded(&mut ar, source, destination).await?; + + ar.finish().await?; + + Ok(()) +} + +/// Implementation of snapshot for a timeline which is resident (includes some segment data) +pub async fn stream_snapshot_resident_guts( + tli: WalResidentTimeline, + source: NodeId, + destination: NodeId, + tx: mpsc::Sender>, +) -> Result<()> { + let mut ar = prepare_tar_stream(tx); let bctx = tli.start_snapshot(&mut ar, source, destination).await?; pausable_failpoint!("sk-snapshot-after-list-pausable"); @@ -138,6 +178,70 @@ pub async fn stream_snapshot_guts( Ok(()) } +impl Timeline { + /// Simple snapshot for an offloaded timeline: we will only upload a renamed partial segment and + /// pass a modified control file into the provided tar stream (nothing with data segments on disk, since + /// we are offloaded and there aren't any) + async fn snapshot_offloaded( + self: &Arc, + ar: &mut tokio_tar::Builder, + source: NodeId, + destination: NodeId, + ) -> Result<()> { + // Take initial copy of control file, then release state lock + let mut control_file = { + let shared_state = self.write_shared_state().await; + + let control_file = TimelinePersistentState::clone(shared_state.sk.state()); + + // Rare race: we got unevicted between entering function and reading control file. + // We error out and let API caller retry. + if !matches!(control_file.eviction_state, EvictionState::Offloaded(_)) { + bail!("Timeline was un-evicted during snapshot, please retry"); + } + + control_file + }; + + // Modify the partial segment of the in-memory copy for the control file to + // point to the destination safekeeper. + let replace = control_file + .partial_backup + .replace_uploaded_segment(source, destination)?; + + let Some(replace) = replace else { + // In Manager:: ready_for_eviction, we do not permit eviction unless the timeline + // has a partial segment. It is unexpected that an offloaded timeline would have none. + anyhow::bail!("Timeline has no partial segment, cannot generate snapshot"); + }; + + tracing::info!("Replacing uploaded partial segment in in-mem control file: {replace:?}"); + + // Optimistically try to copy the partial segment to the destination's path: this + // can fail if the timeline was un-evicted and modified in the background. + let remote_timeline_path = &self.remote_path; + wal_backup::copy_partial_segment( + &replace.previous.remote_path(remote_timeline_path), + &replace.current.remote_path(remote_timeline_path), + ) + .await?; + + // Since the S3 copy succeeded with the path given in our control file snapshot, and + // we are sending that snapshot in our response, we are giving the caller a consistent + // snapshot even if our local Timeline was unevicted or otherwise modified in the meantime. 
+ let buf = control_file + .write_to_buf() + .with_context(|| "failed to serialize control store")?; + let mut header = Header::new_gnu(); + header.set_size(buf.len().try_into().expect("never breaches u64")); + ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice()) + .await + .with_context(|| "failed to append to archive")?; + + Ok(()) + } +} + impl WalResidentTimeline { /// Start streaming tar archive with timeline: /// 1) stream control file under lock; diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 3dbf72298f..f97e127a17 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -21,18 +21,15 @@ use postgres_backend::QueryError; use pq_proto::BeMessage; use serde::Deserialize; use serde::Serialize; +use std::future; use std::net::SocketAddr; use std::sync::Arc; use tokio::io::AsyncRead; use tokio::io::AsyncWrite; -use tokio::sync::mpsc::channel; -use tokio::sync::mpsc::error::TryRecvError; -use tokio::sync::mpsc::Receiver; -use tokio::sync::mpsc::Sender; +use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio::task; use tokio::task::JoinHandle; -use tokio::time::Duration; -use tokio::time::Instant; +use tokio::time::{Duration, MissedTickBehavior}; use tracing::*; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -444,9 +441,9 @@ async fn network_write( } } -// Send keepalive messages to walproposer, to make sure it receives updates -// even when it writes a steady stream of messages. -const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); +/// The WAL flush interval. This ensures we periodically flush the WAL and send AppendResponses to +/// walproposer, even when it's writing a steady stream of messages. +const FLUSH_INTERVAL: Duration = Duration::from_secs(1); /// Encapsulates a task which takes messages from msg_rx, processes and pushes /// replies to reply_tx. @@ -494,67 +491,76 @@ impl WalAcceptor { async fn run(&mut self) -> anyhow::Result<()> { let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id); - // After this timestamp we will stop processing AppendRequests and send a response - // to the walproposer. walproposer sends at least one AppendRequest per second, - // we will send keepalives by replying to these requests once per second. - let mut next_keepalive = Instant::now(); + // Periodically flush the WAL. + let mut flush_ticker = tokio::time::interval(FLUSH_INTERVAL); + flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); + flush_ticker.tick().await; // skip the initial, immediate tick - while let Some(mut next_msg) = self.msg_rx.recv().await { - // Update walreceiver state in shmem for reporting. - if let ProposerAcceptorMessage::Elected(_) = &next_msg { - walreceiver_guard.get().status = WalReceiverStatus::Streaming; - } + // Tracks unflushed appends. + let mut dirty = false; - let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) { - // Loop through AppendRequests while available to write as many WAL records as - // possible without fsyncing. - // - // Make sure the WAL is flushed before returning, see: - // https://github.com/neondatabase/neon/issues/9259 - // - // Note: this will need to be rewritten if we want to read non-AppendRequest messages here. - // Otherwise, we might end up in a situation where we read a message, but don't - // process it. 
- while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg { - let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); - - if let Some(reply) = self.tli.process_msg(&noflush_msg).await? { - if self.reply_tx.send(reply).await.is_err() { - break; // disconnected, flush WAL and return on next send/recv - } - } - - // get out of this loop if keepalive time is reached - if Instant::now() >= next_keepalive { + loop { + let reply = tokio::select! { + // Process inbound message. + msg = self.msg_rx.recv() => { + // If disconnected, break to flush WAL and return. + let Some(mut msg) = msg else { break; + }; + + // Update walreceiver state in shmem for reporting. + if let ProposerAcceptorMessage::Elected(_) = &msg { + walreceiver_guard.get().status = WalReceiverStatus::Streaming; } - // continue pulling AppendRequests if available - match self.msg_rx.try_recv() { - Ok(msg) => next_msg = msg, - Err(TryRecvError::Empty) => break, - // on disconnect, flush WAL and return on next send/recv - Err(TryRecvError::Disconnected) => break, - }; + // Don't flush the WAL on every append, only periodically via flush_ticker. + // This batches multiple appends per fsync. If the channel is empty after + // sending the reply, we'll schedule an immediate flush. + if let ProposerAcceptorMessage::AppendRequest(append_request) = msg { + msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); + dirty = true; + } + + self.tli.process_msg(&msg).await? } - // flush all written WAL to the disk - self.tli - .process_msg(&ProposerAcceptorMessage::FlushWAL) - .await? - } else { - // process message other than AppendRequest - self.tli.process_msg(&next_msg).await? + // While receiving AppendRequests, flush the WAL periodically and respond with an + // AppendResponse to let walproposer know we're still alive. + _ = flush_ticker.tick(), if dirty => { + dirty = false; + self.tli + .process_msg(&ProposerAcceptorMessage::FlushWAL) + .await? + } + + // If there are no pending messages, flush the WAL immediately. + // + // TODO: this should be done via flush_ticker.reset_immediately(), but that's always + // delayed by 1ms due to this bug: https://github.com/tokio-rs/tokio/issues/6866. + _ = future::ready(()), if dirty && self.msg_rx.is_empty() => { + dirty = false; + flush_ticker.reset(); + self.tli + .process_msg(&ProposerAcceptorMessage::FlushWAL) + .await? + } }; - if let Some(reply) = reply_msg { + // Send reply, if any. + if let Some(reply) = reply { if self.reply_tx.send(reply).await.is_err() { - return Ok(()); // chan closed, streaming terminated + break; // disconnected, break to flush WAL and return } - // reset keepalive time - next_keepalive = Instant::now() + KEEPALIVE_INTERVAL; } } + + // Flush WAL on disconnect, see https://github.com/neondatabase/neon/issues/9259. + if dirty { + self.tli + .process_msg(&ProposerAcceptorMessage::FlushWAL) + .await?; + } + Ok(()) } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index c737dfcf9b..f0113978c4 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -797,14 +797,17 @@ impl Timeline { state.sk.term_bump(to).await } - /// Get the timeline guard for reading/writing WAL files. - /// If WAL files are not present on disk (evicted), they will be automatically - /// downloaded from remote storage. This is done in the manager task, which is - /// responsible for issuing all guards. - /// - /// NB: don't use this function from timeline_manager, it will deadlock. 
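The rewritten WalAcceptor::run above batches appends: it stages them as NoFlushAppendRequest and only issues FlushWAL on the one-second flush_ticker or as soon as the inbound channel drains, plus a final flush on disconnect. A rough asyncio analogue of the same batching policy, offered as a sketch rather than the safekeeper code; `process_noflush` and `flush` stand in for `process_msg` with NoFlushAppendRequest and FlushWAL, and None is used as a disconnect sentinel.

import asyncio

FLUSH_INTERVAL = 1.0  # seconds; mirrors FLUSH_INTERVAL above


async def wal_acceptor_loop(msgs: asyncio.Queue, process_noflush, flush):
    loop = asyncio.get_running_loop()
    dirty = False  # tracks unflushed appends
    deadline = loop.time() + FLUSH_INTERVAL
    while True:
        try:
            timeout = max(0.0, deadline - loop.time()) if dirty else None
            msg = await asyncio.wait_for(msgs.get(), timeout)
        except asyncio.TimeoutError:
            await flush()  # periodic flush under a sustained stream of appends
            dirty, deadline = False, loop.time() + FLUSH_INTERVAL
            continue
        if msg is None:  # disconnect sentinel
            break
        if not dirty:
            deadline = loop.time() + FLUSH_INTERVAL
        await process_noflush(msg)  # stage the append without fsync
        dirty = True
        if msgs.empty():  # no backlog: flush and reply right away
            await flush()
            dirty, deadline = False, loop.time() + FLUSH_INTERVAL
    if dirty:
        await flush()  # flush on disconnect, cf. the issue 9259 reference above

As in the Rust loop, the point of the final flush is that any WAL staged without fsync is made durable before the task returns.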
- /// NB: don't use this function while holding shared_state lock. - pub async fn wal_residence_guard(self: &Arc) -> Result { + /// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`] + async fn do_wal_residence_guard( + self: &Arc, + block: bool, + ) -> Result> { + let op_label = if block { + "wal_residence_guard" + } else { + "try_wal_residence_guard" + }; + if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } @@ -816,10 +819,13 @@ impl Timeline { // Wait 30 seconds for the guard to be acquired. It can time out if someone is // holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task // is stuck. - let res = tokio::time::timeout_at( - started_at + Duration::from_secs(30), - self.manager_ctl.wal_residence_guard(), - ) + let res = tokio::time::timeout_at(started_at + Duration::from_secs(30), async { + if block { + self.manager_ctl.wal_residence_guard().await.map(Some) + } else { + self.manager_ctl.try_wal_residence_guard().await + } + }) .await; let guard = match res { @@ -827,14 +833,14 @@ impl Timeline { let finished_at = Instant::now(); let elapsed = finished_at - started_at; MISC_OPERATION_SECONDS - .with_label_values(&["wal_residence_guard"]) + .with_label_values(&[op_label]) .observe(elapsed.as_secs_f64()); guard } Ok(Err(e)) => { warn!( - "error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + "error acquiring in {op_label}, statuses {:?} => {:?}", status_before, self.mgr_status.get() ); @@ -842,7 +848,7 @@ impl Timeline { } Err(_) => { warn!( - "timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + "timeout acquiring in {op_label} guard, statuses {:?} => {:?}", status_before, self.mgr_status.get() ); @@ -850,7 +856,28 @@ impl Timeline { } }; - Ok(WalResidentTimeline::new(self.clone(), guard)) + Ok(guard.map(|g| WalResidentTimeline::new(self.clone(), g))) + } + + /// Get the timeline guard for reading/writing WAL files. + /// If WAL files are not present on disk (evicted), they will be automatically + /// downloaded from remote storage. This is done in the manager task, which is + /// responsible for issuing all guards. + /// + /// NB: don't use this function from timeline_manager, it will deadlock. + /// NB: don't use this function while holding shared_state lock. + pub async fn wal_residence_guard(self: &Arc) -> Result { + self.do_wal_residence_guard(true) + .await + .map(|m| m.expect("Always get Some in block=true mode")) + } + + /// Get the timeline guard for reading/writing WAL files if the timeline is resident, + /// else return None + pub(crate) async fn try_wal_residence_guard( + self: &Arc, + ) -> Result> { + self.do_wal_residence_guard(false).await } pub async fn backup_partial_reset(self: &Arc) -> Result> { diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index f5363ae9b0..303421c837 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -56,6 +56,9 @@ impl Manager { // This also works for the first segment despite last_removed_segno // being 0 on init because this 0 triggers run of wal_removal_task // on success of which manager updates the horizon. + // + // **Note** pull_timeline functionality assumes that evicted timelines always have + // a partial segment: if we ever change this condition, must also update that code. 
&& self .partial_backup_uploaded .as_ref() diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index f0583dd3ff..79200fff8d 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -100,6 +100,8 @@ const REFRESH_INTERVAL: Duration = Duration::from_millis(300); pub enum ManagerCtlMessage { /// Request to get a guard for WalResidentTimeline, with WAL files available locally. GuardRequest(tokio::sync::oneshot::Sender>), + /// Get a guard for WalResidentTimeline if the timeline is not currently offloaded, else None + TryGuardRequest(tokio::sync::oneshot::Sender>), /// Request to drop the guard. GuardDrop(GuardId), /// Request to reset uploaded partial backup state. @@ -110,6 +112,7 @@ impl std::fmt::Debug for ManagerCtlMessage { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"), + ManagerCtlMessage::TryGuardRequest(_) => write!(f, "TryGuardRequest"), ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id), ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"), } @@ -152,6 +155,19 @@ impl ManagerCtl { .and_then(std::convert::identity) } + /// Issue a new guard if the timeline is currently not offloaded, else return None + /// Sends a message to the manager and waits for the response. + /// Can be blocked indefinitely if the manager is stuck. + pub async fn try_wal_residence_guard(&self) -> anyhow::Result> { + let (tx, rx) = tokio::sync::oneshot::channel(); + self.manager_tx + .send(ManagerCtlMessage::TryGuardRequest(tx))?; + + // wait for the manager to respond with the guard + rx.await + .map_err(|e| anyhow::anyhow!("response read fail: {:?}", e)) + } + /// Request timeline manager to reset uploaded partial segment state and /// wait for the result. 
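try_wal_residence_guard follows the same oneshot request/response shape as the existing guard request, but it never blocks on un-eviction: the manager answers None while the timeline is offloaded. A small asyncio sketch of that round trip; `state.is_offloaded` and `state.create_guard()` are assumed stand-ins for the manager's bookkeeping, not real APIs.

import asyncio


async def manager_loop(rx: asyncio.Queue, state) -> None:
    # Manager side: hand out a guard only while resident, never trigger a download.
    while True:
        kind, reply = await rx.get()
        if kind == "try_guard":
            reply.set_result(None if state.is_offloaded else state.create_guard())


async def try_wal_residence_guard(tx: asyncio.Queue):
    # Caller side: oneshot-style request over the manager channel, then await the answer.
    reply = asyncio.get_running_loop().create_future()
    await tx.put(("try_guard", reply))
    return await reply  # None means the timeline is currently offloaded

This is what lets the snapshot path serve an evicted timeline without promoting its segments back to local disk, which the new safekeeper test further down relies on.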
pub async fn backup_partial_reset(&self) -> anyhow::Result> { @@ -674,6 +690,17 @@ impl Manager { warn!("failed to reply with a guard, receiver dropped"); } } + Some(ManagerCtlMessage::TryGuardRequest(tx)) => { + let result = if self.is_offloaded { + None + } else { + Some(self.access_service.create_guard()) + }; + + if tx.send(result).is_err() { + warn!("failed to reply with a guard, receiver dropped"); + } + } Some(ManagerCtlMessage::GuardDrop(guard_id)) => { self.access_service.drop_guard(guard_id); } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index afefe8598c..face3d2c2d 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -968,6 +968,28 @@ async fn handle_tenant_shard_migrate( ) } +async fn handle_tenant_shard_cancel_reconcile( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + json_response( + StatusCode::OK, + service + .tenant_shard_cancel_reconcile(tenant_shard_id) + .await?, + ) +} + async fn handle_tenant_update_policy(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1776,6 +1798,16 @@ pub fn make_router( RequestName("control_v1_tenant_migrate"), ) }) + .put( + "/control/v1/tenant/:tenant_shard_id/cancel_reconcile", + |r| { + tenant_service_handler( + r, + handle_tenant_shard_cancel_reconcile, + RequestName("control_v1_tenant_cancel_reconcile"), + ) + }, + ) .put("/control/v1/tenant/:tenant_id/shard_split", |r| { tenant_service_handler( r, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a2a6e63dd2..3f6cbfef59 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4834,6 +4834,43 @@ impl Service { Ok(TenantShardMigrateResponse {}) } + /// 'cancel' in this context means cancel any ongoing reconcile + pub(crate) async fn tenant_shard_cancel_reconcile( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), ApiError> { + // Take state lock and fire the cancellation token, after which we drop lock and wait for any ongoing reconcile to complete + let waiter = { + let locked = self.inner.write().unwrap(); + let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + let waiter = shard.get_waiter(); + match waiter { + None => { + tracing::info!("Shard does not have an ongoing Reconciler"); + return Ok(()); + } + Some(waiter) => { + tracing::info!("Cancelling Reconciler"); + shard.cancel_reconciler(); + waiter + } + } + }; + + // Cancellation should be prompt. If this fails we have still done our job of firing the + // cancellation token, but by returning an ApiError we will indicate to the caller that + // the Reconciler is misbehaving and not respecting the cancellation token + self.await_waiters(vec![waiter], SHORT_RECONCILE_TIMEOUT) + .await?; + + Ok(()) + } + /// This is for debug/support only: we simply drop all state for a tenant, without /// detaching or deleting it on pageservers. 
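The new cancel endpoint is also reachable with a plain HTTP client. A minimal sketch, assuming the admin-scoped bearer token used elsewhere in the test suite; the URL matches the route registered in http.rs above, and the shard-id format mirrors the regression test below.

import requests


def cancel_reconcile(api: str, admin_jwt: str, tenant_shard_id: str) -> None:
    # PUT /control/v1/tenant/{tenant_shard_id}/cancel_reconcile, admin scope required.
    resp = requests.put(
        f"{api}/control/v1/tenant/{tenant_shard_id}/cancel_reconcile",
        headers={"Authorization": f"Bearer {admin_jwt}"},
        timeout=30,
    )
    resp.raise_for_status()


# e.g. shard 1 of a 2-shard tenant: cancel_reconcile(api, token, f"{tenant_id}-0102")

A 200 is returned even when no reconcile is in flight; an error from the endpoint means the Reconciler did not wind down within SHORT_RECONCILE_TIMEOUT despite its cancellation token being fired.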
pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> { @@ -4921,16 +4958,7 @@ impl Service { stripe_size, }, placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking - - // There is no way to know what the tenant's config was: revert to defaults - // - // TODO: remove `switch_aux_file_policy` once we finish auxv2 migration - // - // we write to both v1+v2 storage, so that the test case can use either storage format for testing - config: TenantConfig { - switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation), - ..TenantConfig::default() - }, + config: TenantConfig::default(), }) .await?; diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index e696c72ba7..27c97d3b86 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1317,6 +1317,12 @@ impl TenantShard { }) } + pub(crate) fn cancel_reconciler(&self) { + if let Some(handle) = self.reconciler.as_ref() { + handle.cancel.cancel() + } + } + /// Get a waiter for any reconciliation in flight, but do not start reconciliation /// if it is not already running pub(crate) fn get_waiter(&self) -> Option { diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 4a3194c691..84eda52d33 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -3,6 +3,7 @@ from __future__ import annotations pytest_plugins = ( "fixtures.pg_version", "fixtures.parametrize", + "fixtures.h2server", "fixtures.httpserver", "fixtures.compute_reconfigure", "fixtures.storage_controller_proxy", diff --git a/test_runner/fixtures/h2server.py b/test_runner/fixtures/h2server.py new file mode 100644 index 0000000000..92783e1fb2 --- /dev/null +++ b/test_runner/fixtures/h2server.py @@ -0,0 +1,198 @@ +""" +https://python-hyper.org/projects/hyper-h2/en/stable/asyncio-example.html + +auth-broker -> local-proxy needs a h2 connection, so we need a h2 server :) +""" + +import asyncio +import collections +import io +import json +from collections.abc import AsyncIterable + +import pytest_asyncio +from h2.config import H2Configuration +from h2.connection import H2Connection +from h2.errors import ErrorCodes +from h2.events import ( + ConnectionTerminated, + DataReceived, + RemoteSettingsChanged, + RequestReceived, + StreamEnded, + StreamReset, + WindowUpdated, +) +from h2.exceptions import ProtocolError, StreamClosedError +from h2.settings import SettingCodes + +RequestData = collections.namedtuple("RequestData", ["headers", "data"]) + + +class H2Server: + def __init__(self, host, port) -> None: + self.host = host + self.port = port + + +class H2Protocol(asyncio.Protocol): + def __init__(self): + config = H2Configuration(client_side=False, header_encoding="utf-8") + self.conn = H2Connection(config=config) + self.transport = None + self.stream_data = {} + self.flow_control_futures = {} + + def connection_made(self, transport: asyncio.Transport): # type: ignore[override] + self.transport = transport + self.conn.initiate_connection() + self.transport.write(self.conn.data_to_send()) + + def connection_lost(self, _exc): + for future in self.flow_control_futures.values(): + future.cancel() + self.flow_control_futures = {} + + def data_received(self, data: bytes): + assert self.transport is not None + try: + events = self.conn.receive_data(data) + except ProtocolError: + self.transport.write(self.conn.data_to_send()) + self.transport.close() + else: + self.transport.write(self.conn.data_to_send()) + for 
event in events: + if isinstance(event, RequestReceived): + self.request_received(event.headers, event.stream_id) + elif isinstance(event, DataReceived): + self.receive_data(event.data, event.stream_id) + elif isinstance(event, StreamEnded): + self.stream_complete(event.stream_id) + elif isinstance(event, ConnectionTerminated): + self.transport.close() + elif isinstance(event, StreamReset): + self.stream_reset(event.stream_id) + elif isinstance(event, WindowUpdated): + self.window_updated(event.stream_id, event.delta) + elif isinstance(event, RemoteSettingsChanged): + if SettingCodes.INITIAL_WINDOW_SIZE in event.changed_settings: + self.window_updated(None, 0) + + self.transport.write(self.conn.data_to_send()) + + def request_received(self, headers: list[tuple[str, str]], stream_id: int): + headers_map = collections.OrderedDict(headers) + + # Store off the request data. + request_data = RequestData(headers_map, io.BytesIO()) + self.stream_data[stream_id] = request_data + + def stream_complete(self, stream_id: int): + """ + When a stream is complete, we can send our response. + """ + try: + request_data = self.stream_data[stream_id] + except KeyError: + # Just return, we probably 405'd this already + return + + headers = request_data.headers + body = request_data.data.getvalue().decode("utf-8") + + data = json.dumps({"headers": headers, "body": body}, indent=4).encode("utf8") + + response_headers = ( + (":status", "200"), + ("content-type", "application/json"), + ("content-length", str(len(data))), + ) + self.conn.send_headers(stream_id, response_headers) + asyncio.ensure_future(self.send_data(data, stream_id)) + + def receive_data(self, data: bytes, stream_id: int): + """ + We've received some data on a stream. If that stream is one we're + expecting data on, save it off. Otherwise, reset the stream. + """ + try: + stream_data = self.stream_data[stream_id] + except KeyError: + self.conn.reset_stream(stream_id, error_code=ErrorCodes.PROTOCOL_ERROR) + else: + stream_data.data.write(data) + + def stream_reset(self, stream_id): + """ + A stream reset was sent. Stop sending data. + """ + if stream_id in self.flow_control_futures: + future = self.flow_control_futures.pop(stream_id) + future.cancel() + + async def send_data(self, data, stream_id): + """ + Send data according to the flow control rules. + """ + while data: + while self.conn.local_flow_control_window(stream_id) < 1: + try: + await self.wait_for_flow_control(stream_id) + except asyncio.CancelledError: + return + + chunk_size = min( + self.conn.local_flow_control_window(stream_id), + len(data), + self.conn.max_outbound_frame_size, + ) + + try: + self.conn.send_data( + stream_id, data[:chunk_size], end_stream=(chunk_size == len(data)) + ) + except (StreamClosedError, ProtocolError): + # The stream got closed and we didn't get told. We're done + # here. + break + + assert self.transport is not None + self.transport.write(self.conn.data_to_send()) + data = data[chunk_size:] + + async def wait_for_flow_control(self, stream_id): + """ + Waits for a Future that fires when the flow control window is opened. + """ + f: asyncio.Future[None] = asyncio.Future() + self.flow_control_futures[stream_id] = f + await f + + def window_updated(self, stream_id, delta): + """ + A window update frame was received. Unblock some number of flow control + Futures. 
+ """ + if stream_id and stream_id in self.flow_control_futures: + f = self.flow_control_futures.pop(stream_id) + f.set_result(delta) + elif not stream_id: + for f in self.flow_control_futures.values(): + f.set_result(delta) + + self.flow_control_futures = {} + + +@pytest_asyncio.fixture(scope="function") +async def http2_echoserver() -> AsyncIterable[H2Server]: + loop = asyncio.get_event_loop() + serve = await loop.create_server(H2Protocol, "127.0.0.1", 0) + (host, port) = serve.sockets[0].getsockname() + + asyncio.create_task(serve.wait_closed()) + + server = H2Server(host, port) + yield server + + serve.close() diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 1b2767e296..d220ea57a2 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -16,7 +16,6 @@ from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.common_types import IndexPartDump from fixtures.pg_version import PgVersion -from fixtures.utils import AuxFileStore if TYPE_CHECKING: from typing import ( @@ -201,7 +200,6 @@ class NeonLocalCli(AbstractNeonCli): shard_stripe_size: Optional[int] = None, placement_policy: Optional[str] = None, set_default: bool = False, - aux_file_policy: Optional[AuxFileStore] = None, ): """ Creates a new tenant, returns its id and its initial timeline's id. @@ -223,13 +221,6 @@ class NeonLocalCli(AbstractNeonCli): ) ) - if aux_file_policy is AuxFileStore.V2: - args.extend(["-c", "switch_aux_file_policy:v2"]) - elif aux_file_policy is AuxFileStore.V1: - args.extend(["-c", "switch_aux_file_policy:v1"]) - elif aux_file_policy is AuxFileStore.CrossValidation: - args.extend(["-c", "switch_aux_file_policy:cross-validation"]) - if set_default: args.append("--set-default") diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6491069f20..1b9bc873f4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -35,11 +35,13 @@ import toml from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest +from jwcrypto import jwk # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import cursor as PgCursor from psycopg2.extensions import make_dsn, parse_dsn +from pytest_httpserver import HTTPServer from urllib3.util.retry import Retry from fixtures import overlayfs @@ -53,6 +55,7 @@ from fixtures.common_types import ( TimelineId, ) from fixtures.endpoint.http import EndpointHttpClient +from fixtures.h2server import H2Server from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.neon_cli import NeonLocalCli, Pagectl @@ -94,7 +97,6 @@ from fixtures.utils import ( subprocess_capture, wait_until, ) -from fixtures.utils import AuxFileStore as AuxFileStore # reexport from .neon_api import NeonAPI, NeonApiEndpoint @@ -353,7 +355,6 @@ class NeonEnvBuilder: initial_tenant: Optional[TenantId] = None, initial_timeline: Optional[TimelineId] = None, pageserver_virtual_file_io_engine: Optional[str] = None, - pageserver_aux_file_policy: Optional[AuxFileStore] = None, pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = None, safekeeper_extra_opts: Optional[list[str]] = None, storage_controller_port_override: Optional[int] = None, @@ -405,8 +406,6 @@ class NeonEnvBuilder: f"Overriding pageserver default compaction 
algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" ) - self.pageserver_aux_file_policy = pageserver_aux_file_policy - self.safekeeper_extra_opts = safekeeper_extra_opts self.storage_controller_port_override = storage_controller_port_override @@ -467,7 +466,6 @@ class NeonEnvBuilder: timeline_id=env.initial_timeline, shard_count=initial_tenant_shard_count, shard_stripe_size=initial_tenant_shard_stripe_size, - aux_file_policy=self.pageserver_aux_file_policy, ) assert env.initial_tenant == initial_tenant assert env.initial_timeline == initial_timeline @@ -1027,7 +1025,6 @@ class NeonEnv: self.control_plane_compute_hook_api = config.control_plane_compute_hook_api self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine - self.pageserver_aux_file_policy = config.pageserver_aux_file_policy self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode # Create the neon_local's `NeonLocalInitConf` @@ -1323,7 +1320,6 @@ class NeonEnv: shard_stripe_size: Optional[int] = None, placement_policy: Optional[str] = None, set_default: bool = False, - aux_file_policy: Optional[AuxFileStore] = None, ) -> tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. @@ -1340,7 +1336,6 @@ class NeonEnv: shard_stripe_size=shard_stripe_size, placement_policy=placement_policy, set_default=set_default, - aux_file_policy=aux_file_policy, ) return tenant_id, timeline_id @@ -1398,7 +1393,6 @@ def neon_simple_env( compatibility_pg_distrib_dir: Path, pg_version: PgVersion, pageserver_virtual_file_io_engine: str, - pageserver_aux_file_policy: Optional[AuxFileStore], pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnv]: @@ -1431,7 +1425,6 @@ def neon_simple_env( test_name=request.node.name, test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, - pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, combination=combination, @@ -1458,7 +1451,6 @@ def neon_env_builder( top_output_dir: Path, pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], - pageserver_aux_file_policy: Optional[AuxFileStore], record_property: Callable[[str, object], None], pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnvBuilder]: @@ -1501,7 +1493,6 @@ def neon_env_builder( test_name=request.node.name, test_output_dir=test_output_dir, test_overlay_dir=test_overlay_dir, - pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, ) as builder: @@ -3092,6 +3083,31 @@ class PSQL: ) +def generate_proxy_tls_certs(common_name: str, key_path: Path, crt_path: Path): + if not key_path.exists(): + r = subprocess.run( + [ + "openssl", + "req", + "-new", + "-x509", + "-days", + "365", + "-nodes", + "-text", + "-out", + str(crt_path), + "-keyout", + str(key_path), + "-subj", + f"/CN={common_name}", + "-addext", + f"subjectAltName = DNS:{common_name}", + ] + ) + assert r.returncode == 0 + + class NeonProxy(PgProtocol): link_auth_uri: str = "http://dummy-uri" @@ -3190,29 +3206,7 @@ class 
NeonProxy(PgProtocol): # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / "proxy.key" - - if not key_path.exists(): - r = subprocess.run( - [ - "openssl", - "req", - "-new", - "-x509", - "-days", - "365", - "-nodes", - "-text", - "-out", - str(crt_path), - "-keyout", - str(key_path), - "-subj", - "/CN=*.localtest.me", - "-addext", - "subjectAltName = DNS:*.localtest.me", - ] - ) - assert r.returncode == 0 + generate_proxy_tls_certs("*.localtest.me", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3392,6 +3386,125 @@ class NeonProxy(PgProtocol): assert out == "ok" +class NeonAuthBroker: + class ControlPlane: + def __init__(self, endpoint: str): + self.endpoint = endpoint + + def extra_args(self) -> list[str]: + args = [ + *["--auth-backend", "console"], + *["--auth-endpoint", self.endpoint], + ] + return args + + def __init__( + self, + neon_binpath: Path, + test_output_dir: Path, + http_port: int, + mgmt_port: int, + external_http_port: int, + auth_backend: NeonAuthBroker.ControlPlane, + ): + self.domain = "apiauth.localtest.me" # resolves to 127.0.0.1 + self.host = "127.0.0.1" + self.http_port = http_port + self.external_http_port = external_http_port + self.neon_binpath = neon_binpath + self.test_output_dir = test_output_dir + self.mgmt_port = mgmt_port + self.auth_backend = auth_backend + self.http_timeout_seconds = 15 + self._popen: Optional[subprocess.Popen[bytes]] = None + + def start(self) -> NeonAuthBroker: + assert self._popen is None + + # generate key of it doesn't exist + crt_path = self.test_output_dir / "proxy.crt" + key_path = self.test_output_dir / "proxy.key" + generate_proxy_tls_certs("apiauth.localtest.me", key_path, crt_path) + + args = [ + str(self.neon_binpath / "proxy"), + *["--http", f"{self.host}:{self.http_port}"], + *["--mgmt", f"{self.host}:{self.mgmt_port}"], + *["--wss", f"{self.host}:{self.external_http_port}"], + *["-c", str(crt_path)], + *["-k", str(key_path)], + *["--sql-over-http-pool-opt-in", "false"], + *["--is-auth-broker", "true"], + *self.auth_backend.extra_args(), + ] + + logfile = open(self.test_output_dir / "proxy.log", "w") + self._popen = subprocess.Popen(args, stdout=logfile, stderr=logfile) + self._wait_until_ready() + return self + + # Sends SIGTERM to the proxy if it has been started + def terminate(self): + if self._popen: + self._popen.terminate() + + # Waits for proxy to exit if it has been opened with a default timeout of + # two seconds. Raises subprocess.TimeoutExpired if the proxy does not exit in time. + def wait_for_exit(self, timeout=2): + if self._popen: + self._popen.wait(timeout=timeout) + + @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) + def _wait_until_ready(self): + assert ( + self._popen and self._popen.poll() is None + ), "Proxy exited unexpectedly. Check test log." 
+ requests.get(f"http://{self.host}:{self.http_port}/v1/status") + + async def query(self, query, args, **kwargs): + user = kwargs["user"] + token = kwargs["token"] + expected_code = kwargs.get("expected_code") + + log.info(f"Executing http query: {query}") + + connstr = f"postgresql://{user}@{self.domain}/postgres" + async with httpx.AsyncClient(verify=str(self.test_output_dir / "proxy.crt")) as client: + response = await client.post( + f"https://{self.domain}:{self.external_http_port}/sql", + json={"query": query, "params": args}, + headers={ + "Neon-Connection-String": connstr, + "Authorization": f"Bearer {token}", + }, + ) + + if expected_code is not None: + assert response.status_code == expected_code, f"response: {response.json()}" + return response.json() + + def get_metrics(self) -> str: + request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") + return request_result.text + + def __enter__(self) -> NeonAuthBroker: + return self + + def __exit__( + self, + _exc_type: Optional[type[BaseException]], + _exc_value: Optional[BaseException], + _traceback: Optional[TracebackType], + ): + if self._popen is not None: + self._popen.terminate() + try: + self._popen.wait(timeout=5) + except subprocess.TimeoutExpired: + log.warning("failed to gracefully terminate proxy; killing") + self._popen.kill() + + @pytest.fixture(scope="function") def link_proxy( port_distributor: PortDistributor, neon_binpath: Path, test_output_dir: Path @@ -3456,6 +3569,74 @@ def static_proxy( yield proxy +@pytest.fixture(scope="function") +def neon_authorize_jwk() -> jwk.JWK: + kid = str(uuid.uuid4()) + key = jwk.JWK.generate(kty="RSA", size=2048, alg="RS256", use="sig", kid=kid) + assert isinstance(key, jwk.JWK) + return key + + +@pytest.fixture(scope="function") +def static_auth_broker( + port_distributor: PortDistributor, + neon_binpath: Path, + test_output_dir: Path, + httpserver: HTTPServer, + neon_authorize_jwk: jwk.JWK, + http2_echoserver: H2Server, +) -> Iterable[NeonAuthBroker]: + """Neon Auth Broker that routes to a mocked local_proxy and a mocked cplane HTTP API.""" + + local_proxy_addr = f"{http2_echoserver.host}:{http2_echoserver.port}" + + # return local_proxy addr on ProxyWakeCompute. + httpserver.expect_request("/cplane/proxy_wake_compute").respond_with_json( + { + "address": local_proxy_addr, + "aux": { + "endpoint_id": "ep-foo-bar-1234", + "branch_id": "br-foo-bar", + "project_id": "foo-bar", + }, + } + ) + + # return jwks mock addr on GetEndpointJwks + httpserver.expect_request(re.compile("^/cplane/endpoints/.+/jwks$")).respond_with_json( + { + "jwks": [ + { + "id": "foo", + "jwks_url": httpserver.url_for("/authorize/jwks.json"), + "provider_name": "test", + "jwt_audience": None, + "role_names": ["anonymous", "authenticated"], + } + ] + } + ) + + # return static fixture jwks. 
+ jwk = neon_authorize_jwk.export_public(as_dict=True) + httpserver.expect_request("/authorize/jwks.json").respond_with_json({"keys": [jwk]}) + + mgmt_port = port_distributor.get_port() + http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + + with NeonAuthBroker( + neon_binpath=neon_binpath, + test_output_dir=test_output_dir, + http_port=http_port, + mgmt_port=mgmt_port, + external_http_port=external_http_port, + auth_backend=NeonAuthBroker.ControlPlane(httpserver.url_for("/cplane")), + ) as proxy: + proxy.start() + yield proxy + + class Endpoint(PgProtocol, LogUtils): """An object representing a Postgres compute endpoint managed by the control plane.""" diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 4114c2fcb3..1131bf090f 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -10,12 +10,6 @@ from _pytest.python import Metafunc from fixtures.pg_version import PgVersion -if TYPE_CHECKING: - from typing import Any, Optional - - from fixtures.utils import AuxFileStore - - if TYPE_CHECKING: from typing import Any, Optional @@ -50,11 +44,6 @@ def pageserver_virtual_file_io_mode() -> Optional[str]: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") -@pytest.fixture(scope="function", autouse=True) -def pageserver_aux_file_policy() -> Optional[AuxFileStore]: - return None - - def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]: toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index d12fa59abc..01b7cf1026 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,7 +1,6 @@ from __future__ import annotations import contextlib -import enum import json import os import re @@ -515,21 +514,6 @@ def assert_no_errors(log_file: Path, service: str, allowed_errors: list[str]): assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add" -@enum.unique -class AuxFileStore(str, enum.Enum): - V1 = "v1" - V2 = "v2" - CrossValidation = "cross-validation" - - @override - def __repr__(self) -> str: - return f"'aux-{self.value}'" - - @override - def __str__(self) -> str: - return f"'aux-{self.value}'" - - def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str]): """ This is essentially: diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 815d186ab9..8b2a296bdd 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -9,7 +9,7 @@ import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync +from fixtures.neon_fixtures import logical_replication_sync if TYPE_CHECKING: from fixtures.benchmark_fixture import NeonBenchmarker @@ -17,7 +17,6 @@ if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv, PgBin -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @pytest.mark.timeout(1000) def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg): env = neon_simple_env diff --git a/test_runner/regress/test_attach_tenant_config.py 
b/test_runner/regress/test_attach_tenant_config.py index 4a7017994d..83d003a5cc 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -172,7 +172,6 @@ def test_fully_custom_config(positive_env: NeonEnv): }, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, - "switch_aux_file_policy": "cross-validation", "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", } diff --git a/test_runner/regress/test_auth_broker.py b/test_runner/regress/test_auth_broker.py new file mode 100644 index 0000000000..11dc7d56b5 --- /dev/null +++ b/test_runner/regress/test_auth_broker.py @@ -0,0 +1,37 @@ +import json + +import pytest +from fixtures.neon_fixtures import NeonAuthBroker +from jwcrypto import jwk, jwt + + +@pytest.mark.asyncio +async def test_auth_broker_happy( + static_auth_broker: NeonAuthBroker, + neon_authorize_jwk: jwk.JWK, +): + """ + Signs a JWT and uses it to authorize a query to local_proxy. + """ + + token = jwt.JWT( + header={"kid": neon_authorize_jwk.key_id, "alg": "RS256"}, claims={"sub": "user1"} + ) + token.make_signed_token(neon_authorize_jwk) + res = await static_auth_broker.query("foo", ["arg1"], user="anonymous", token=token.serialize()) + + # local proxy mock just echos back the request + # check that we forward the correct data + + assert ( + res["headers"]["authorization"] == f"Bearer {token.serialize()}" + ), "JWT should be forwarded" + + assert ( + "anonymous" in res["headers"]["neon-connection-string"] + ), "conn string should be forwarded" + + assert json.loads(res["body"]) == { + "query": "foo", + "params": ["arg1"], + }, "Query body should be forwarded" diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 96657b3ce4..e517e83e6f 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -7,6 +7,7 @@ import psycopg2 import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, VanillaPostgres +from psycopg2.errors import UndefinedObject from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -335,3 +336,34 @@ def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv): if not result: raise AssertionError("Could not count databases") assert result[0] == 0, "Database 'failure' still exists after restart" + + +def test_ddl_forwarding_role_specs(neon_simple_env: NeonEnv): + """ + Postgres has a concept of role specs: + + ROLESPEC_CSTRING: ALTER ROLE xyz + ROLESPEC_CURRENT_USER: ALTER ROLE current_user + ROLESPEC_CURRENT_ROLE: ALTER ROLE current_role + ROLESPEC_SESSION_USER: ALTER ROLE session_user + ROLESPEC_PUBLIC: ALTER ROLE public + + The extension is required to serialize these special role spec into + usernames for the purpose of DDL forwarding. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + with endpoint.cursor() as cur: + # ROLESPEC_CSTRING + cur.execute("ALTER ROLE cloud_admin WITH PASSWORD 'york'") + # ROLESPEC_CURRENT_USER + cur.execute("ALTER ROLE current_user WITH PASSWORD 'pork'") + # ROLESPEC_CURRENT_ROLE + cur.execute("ALTER ROLE current_role WITH PASSWORD 'cork'") + # ROLESPEC_SESSION_USER + cur.execute("ALTER ROLE session_user WITH PASSWORD 'bork'") + # ROLESPEC_PUBLIC + with pytest.raises(UndefinedObject): + cur.execute("ALTER ROLE public WITH PASSWORD 'dork'") diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index c26bf058e2..30027463df 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -5,11 +5,9 @@ from functools import partial from random import choice from string import ascii_lowercase -import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( - AuxFileStore, NeonEnv, NeonEnvBuilder, PgProtocol, @@ -23,17 +21,6 @@ def random_string(n: int): return "".join([choice(ascii_lowercase) for _ in range(n)]) -@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V2, AuxFileStore.CrossValidation] -) -def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore): - env = neon_simple_env - with env.pageserver.http_client() as client: - tenant_config = client.tenant_config(env.initial_tenant).effective_config - assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"] - - -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -173,7 +160,6 @@ COMMIT; # Test that neon.logical_replication_max_snap_files works -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): def slot_removed(ep): assert ( @@ -350,7 +336,6 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of # # Most pages start with a contrecord, so we don't do anything special # to ensure that. -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -395,7 +380,6 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): # logical replication bug as such, but without logical replication, # records passed ot the WAL redo process are never large enough to hit # the bug. 
-@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -467,7 +451,6 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_replication_shutdown(neon_simple_env: NeonEnv): # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed env = neon_simple_env diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index f598900af9..e59d46e352 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -561,7 +561,7 @@ def test_sql_over_http_pool_dos(static_proxy: NeonProxy): # query generates a million rows - should hit the 10MB reponse limit quickly response = query( - 400, + 507, "select * from generate_series(1, 5000) a cross join generate_series(1, 5000) b cross join (select 'foo'::foo) c;", ) assert "response is too large (max is 10485760 bytes)" in response["message"] diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index d4bc4b1a4f..c8de292588 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -18,6 +18,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserver, PageserverAvailability, PageserverSchedulingPolicy, PgBin, @@ -298,17 +299,20 @@ def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder): env.storage_controller.consistency_check() -@pytest.mark.parametrize("warm_up", [True, False]) -def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): +def prepare_onboarding_env( + neon_env_builder: NeonEnvBuilder, +) -> tuple[NeonEnv, NeonPageserver, TenantId, int]: """ - We onboard tenants to the sharding service by treating it as a 'virtual pageserver' - which provides the /location_config API. This is similar to creating a tenant, - but imports the generation number. + For tests that do onboarding of a tenant to the storage controller, a small dance to + set up one pageserver that won't be managed by the storage controller and create + a tenant there. 
""" - # One pageserver to simulate legacy environment, two to be managed by storage controller neon_env_builder.num_pageservers = 3 + # Enable tests to use methods that require real S3 API + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() env.broker.start() @@ -329,7 +333,6 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # will be attached after onboarding env.pageservers[1].start() env.pageservers[2].start() - virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) for sk in env.safekeepers: sk.start() @@ -339,6 +342,23 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up generation = 123 origin_ps.tenant_create(tenant_id, generation=generation) + origin_ps.http_client().timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate()) + + return (env, origin_ps, tenant_id, generation) + + +@pytest.mark.parametrize("warm_up", [True, False]) +def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): + """ + We onboard tenants to the sharding service by treating it as a 'virtual pageserver' + which provides the /location_config API. This is similar to creating a tenant, + but imports the generation number. + """ + + env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder) + + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + # As if doing a live migration, first configure origin into stale mode r = origin_ps.http_client().tenant_location_conf( tenant_id, @@ -475,6 +495,70 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up env.storage_controller.consistency_check() +@run_only_on_default_postgres("this test doesn't start an endpoint") +def test_storage_controller_onboard_detached(neon_env_builder: NeonEnvBuilder): + """ + Sometimes, the control plane wants to delete a tenant that wasn't attached to any pageserver, + and also wasn't ever registered with the storage controller. + + It may do this by calling /location_conf in mode Detached and then calling the delete API + as normal. + """ + + env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder) + + remote_prefix = "/".join( + ( + "tenants", + str(tenant_id), + ) + ) + + # Detach it from its original pageserver. 
+ origin_ps.http_client().tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + # Since we will later assert that remote data is gone, as a control also check it was ever there + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + # Register with storage controller in Detached state + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + generation += 1 + r = virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": generation, + }, + ) + assert len(r["shards"]) == 0 # location_conf tells us there are no attached shards + + # Onboarding in Detached state shouldn't have attached it to any pageserver + for ps in env.pageservers: + assert ps.http_client().tenant_list() == [] + + # Delete it via the storage controller + virtual_ps_http.tenant_delete(tenant_id) + + # Check that we really deleted it + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + def test_storage_controller_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, @@ -872,6 +956,14 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 assert all(v["may_schedule"] for v in response.json()["nodes"].values()) + # Reconciler cancel API should be a no-op when nothing is in flight + env.storage_controller.request( + "PUT", + f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0102/cancel_reconcile", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + # Node unclean drop API response = env.storage_controller.request( "POST", f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", @@ -879,6 +971,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): ) assert len(env.storage_controller.node_list()) == 1 + # Tenant unclean drop API response = env.storage_controller.request( "POST", f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", @@ -892,7 +985,6 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): headers=env.storage_controller.headers(TokenScope.ADMIN), ) assert len(response.json()) == 1 - # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind. 
env.storage_controller.consistency_check() @@ -1660,6 +1752,11 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) assert "Stop" in storcon_cli(["tenants"])[3] + # Cancel ongoing reconcile on a tenant + storcon_cli( + ["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"] + ) + # Change a tenant's placement storcon_cli( ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index cb8724dd1c..77efd7b749 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -137,14 +137,17 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b } ) - # Create two branches and archive them - parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id) - leaf_timeline_id = env.create_branch( - "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" + # Create three branches that depend on each other, starting with two + grandparent_timeline_id = env.create_branch( + "test_ancestor_branch_archive_grandparent", tenant_id + ) + parent_timeline_id = env.create_branch( + "test_ancestor_branch_archive_parent", tenant_id, "test_ancestor_branch_archive_grandparent" ) + # write some stuff to the parent with env.endpoints.create_start( - "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + "test_ancestor_branch_archive_parent", tenant_id=tenant_id ) as endpoint: endpoint.safe_psql_many( [ @@ -154,6 +157,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ) sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + # create the third branch + leaf_timeline_id = env.create_branch( + "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" + ) + ps_http.timeline_archival_config( tenant_id, leaf_timeline_id, @@ -171,6 +179,12 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b state=TimelineArchivalState.ARCHIVED, ) + ps_http.timeline_archival_config( + tenant_id, + grandparent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + def timeline_offloaded_logged(timeline_id: TimelineId) -> bool: return ( env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*") @@ -201,30 +215,34 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ps_http.timeline_archival_config( tenant_id, - parent_timeline_id, + grandparent_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) ps_http.timeline_archival_config( tenant_id, - leaf_timeline_id, + parent_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) - leaf_detail = ps_http.timeline_detail( + parent_detail = ps_http.timeline_detail( tenant_id, - leaf_timeline_id, + parent_timeline_id, ) - assert leaf_detail["is_archived"] is False + assert parent_detail["is_archived"] is False with env.endpoints.create_start( - "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + "test_ancestor_branch_archive_parent", tenant_id=tenant_id ) as endpoint: sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") assert sum == sum_again + # Test that deletion of offloaded timelines works + ps_http.timeline_delete(tenant_id, leaf_timeline_id) + assert not 
timeline_offloaded_logged(initial_timeline_id) -def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("delete_timeline", [False, True]) +def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timeline: bool): """ Test for persistence of timeline offload state """ @@ -306,27 +324,35 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder): assert timeline_offloaded_api(child_timeline_id) assert not timeline_offloaded_api(root_timeline_id) - ps_http.timeline_archival_config( - tenant_id, - child_timeline_id, - state=TimelineArchivalState.UNARCHIVED, - ) - child_detail = ps_http.timeline_detail( - tenant_id, - child_timeline_id, - ) - assert child_detail["is_archived"] is False + if delete_timeline: + ps_http.timeline_delete(tenant_id, child_timeline_id) + with pytest.raises(PageserverApiException, match="not found"): + ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + else: + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + child_detail = ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + assert child_detail["is_archived"] is False - with env.endpoints.create_start( - "test_archived_branch_persisted", tenant_id=tenant_id - ) as endpoint: - sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500") - assert sum == sum_again + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500") + assert sum == sum_again - assert_prefix_empty( - neon_env_builder.pageserver_remote_storage, - prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest", - ) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest", + ) assert not timeline_offloaded_api(root_timeline_id) diff --git a/test_runner/regress/test_unstable_extensions.py b/test_runner/regress/test_unstable_extensions.py new file mode 100644 index 0000000000..06a62ccfd8 --- /dev/null +++ b/test_runner/regress/test_unstable_extensions.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +import pytest +from psycopg2.errors import InsufficientPrivilege + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +def test_unstable_extensions_installation(neon_simple_env: NeonEnv): + """ + Test that the unstable extension support within the neon extension can + block extension installation. 
+ """ + env = neon_simple_env + + neon_unstable_extensions = "pg_prewarm,amcheck" + + endpoint = env.endpoints.create( + "main", + config_lines=[ + "neon.allow_unstable_extensions=false", + f"neon.unstable_extensions='{neon_unstable_extensions}'", + ], + ) + endpoint.respec(skip_pg_catalog_updates=False) + endpoint.start() + + with endpoint.cursor() as cursor: + cursor.execute("SELECT current_setting('neon.unstable_extensions')") + result = cursor.fetchone() + assert result is not None + setting = cast("str", result[0]) + assert setting == neon_unstable_extensions + + with pytest.raises(InsufficientPrivilege): + cursor.execute("CREATE EXTENSION pg_prewarm") + + with pytest.raises(InsufficientPrivilege): + cursor.execute("CREATE EXTENSION amcheck") + + # Make sure that we can install a "stable" extension + cursor.execute("CREATE EXTENSION pageinspect") + + cursor.execute("BEGIN") + cursor.execute("SET neon.allow_unstable_extensions TO true") + cursor.execute("CREATE EXTENSION pg_prewarm") + cursor.execute("COMMIT") diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index d803cd7c78..157390c01c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1998,6 +1998,109 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): pt_handle.join() +def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): + """ + Verify that when pull_timeline is used on an evicted timeline, it does not result in + promoting any segments to local disk on the source, and the timeline is correctly instantiated + in evicted state on the destination. This behavior is important to avoid ballooning disk + usage when doing mass migration of timelines. + """ + neon_env_builder.num_safekeepers = 4 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + + # Configure safekeepers with ultra-fast eviction policy + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--partial-backup-timeout", + "50ms", + "--control-file-save-interval", + "1s", + # Safekeepers usually wait a while before evicting something: for this test we want them to + # evict things as soon as they are inactive. 
+ "--eviction-min-resident=100ms", + "--delete-offloaded-wal", + ] + + initial_tenant_conf = {"lagging_wal_timeout": "1s", "checkpoint_timeout": "100ms"} + env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[-1]) + log.info(f"Will pull_timeline on destination {dst_sk.id} from source {src_sk.id}") + + ep = env.endpoints.create("main") + ep.active_safekeepers = [s.id for s in env.safekeepers if s.id != dst_sk.id] + log.info(f"Compute writing initially to safekeepers: {ep.active_safekeepers}") + ep.active_safekeepers = [1, 2, 3] # Exclude dst_sk from set written by compute initially + ep.start() + ep.safe_psql("CREATE TABLE t(i int)") + ep.safe_psql("INSERT INTO t VALUES (0)") + ep.stop() + + wait_lsn_force_checkpoint_at_sk(src_sk, tenant_id, timeline_id, env.pageserver) + + src_http = src_sk.http_client() + dst_http = dst_sk.http_client() + + def evicted_on_source(): + # Wait for timeline to go into evicted state + assert src_http.get_eviction_state(timeline_id) != "Present" + assert ( + src_http.get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + or 0 > 0 + ) + assert src_http.get_metric_value("safekeeper_evicted_timelines") or 0 > 0 + # Check that on source no segment files are present + assert src_sk.list_segments(tenant_id, timeline_id) == [] + + wait_until(60, 1, evicted_on_source) + + # Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk, + # destination should import the control file only & go into evicted mode immediately + dst_sk.pull_timeline([src_sk], tenant_id, timeline_id) + + # Check that on source and destination no segment files are present + assert src_sk.list_segments(tenant_id, timeline_id) == [] + assert dst_sk.list_segments(tenant_id, timeline_id) == [] + + # Check that the timeline on the destination is in the expected evicted state. + evicted_on_source() # It should still be evicted on the source + + def evicted_on_destination(): + assert dst_http.get_eviction_state(timeline_id) != "Present" + assert dst_http.get_metric_value("safekeeper_evicted_timelines") or 0 > 0 + + # This should be fast, it is a wait_until because eviction state is updated + # in the background wrt pull_timeline. + wait_until(10, 0.1, evicted_on_destination) + + # Delete the timeline on the source, to prove that deletion works on an + # evicted timeline _and_ that the final compute test is really not using + # the original location + src_sk.http_client().timeline_delete(tenant_id, timeline_id, only_local=True) + + # Check that using the timeline correctly un-evicts it on the new location + ep.active_safekeepers = [2, 3, 4] + ep.start() + ep.safe_psql("INSERT INTO t VALUES (0)") + ep.stop() + + def unevicted_on_dest(): + assert ( + dst_http.get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + or 0 > 0 + ) + n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines") + assert n_evicted == 0 + + wait_until(10, 1, unevicted_on_dest) + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. 
In that case # pageserver should maintain a single connection to safekeeper and don't attempt diff --git a/test_runner/stubs/h2/README.md b/test_runner/stubs/h2/README.md new file mode 100644 index 0000000000..cdf181ff80 --- /dev/null +++ b/test_runner/stubs/h2/README.md @@ -0,0 +1 @@ +generated via `poetry run stubgen -p h2 -o test_runner/stubs` diff --git a/test_runner/stubs/h2/__init__.pyi b/test_runner/stubs/h2/__init__.pyi new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/stubs/h2/config.pyi b/test_runner/stubs/h2/config.pyi new file mode 100644 index 0000000000..710005db69 --- /dev/null +++ b/test_runner/stubs/h2/config.pyi @@ -0,0 +1,42 @@ +from _typeshed import Incomplete + +class _BooleanConfigOption: + name: Incomplete + attr_name: Incomplete + def __init__(self, name) -> None: ... + def __get__(self, instance, owner): ... + def __set__(self, instance, value) -> None: ... + +class DummyLogger: + def __init__(self, *vargs) -> None: ... + def debug(self, *vargs, **kwargs) -> None: ... + def trace(self, *vargs, **kwargs) -> None: ... + +class OutputLogger: + file: Incomplete + trace_level: Incomplete + def __init__(self, file: Incomplete | None = ..., trace_level: bool = ...) -> None: ... + def debug(self, fmtstr, *args) -> None: ... + def trace(self, fmtstr, *args) -> None: ... + +class H2Configuration: + client_side: Incomplete + validate_outbound_headers: Incomplete + normalize_outbound_headers: Incomplete + validate_inbound_headers: Incomplete + normalize_inbound_headers: Incomplete + logger: Incomplete + def __init__( + self, + client_side: bool = ..., + header_encoding: Incomplete | None = ..., + validate_outbound_headers: bool = ..., + normalize_outbound_headers: bool = ..., + validate_inbound_headers: bool = ..., + normalize_inbound_headers: bool = ..., + logger: Incomplete | None = ..., + ) -> None: ... + @property + def header_encoding(self): ... + @header_encoding.setter + def header_encoding(self, value) -> None: ... 
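The h2 stubs above are generated with stubgen (see stubs/h2/README.md) and only declare the library's API. As orientation for reviewers, a minimal, illustrative sketch of the kind of call site that h2/config.pyi now lets the type checker verify; the argument values here are made up and this snippet is not part of the diff:

```python
import sys

from h2.config import H2Configuration, OutputLogger

# Hypothetical example values; only the class and keyword names come from the stub above.
config = H2Configuration(
    client_side=True,
    header_encoding="utf-8",          # decode received header bytes to str
    validate_inbound_headers=True,
    logger=OutputLogger(file=sys.stderr, trace_level=True),  # per-frame debug output
)
```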
diff --git a/test_runner/stubs/h2/connection.pyi b/test_runner/stubs/h2/connection.pyi new file mode 100644 index 0000000000..04be18ca74 --- /dev/null +++ b/test_runner/stubs/h2/connection.pyi @@ -0,0 +1,142 @@ +from enum import Enum, IntEnum + +from _typeshed import Incomplete + +from .config import H2Configuration as H2Configuration +from .errors import ErrorCodes as ErrorCodes +from .events import AlternativeServiceAvailable as AlternativeServiceAvailable +from .events import ConnectionTerminated as ConnectionTerminated +from .events import PingAckReceived as PingAckReceived +from .events import PingReceived as PingReceived +from .events import PriorityUpdated as PriorityUpdated +from .events import RemoteSettingsChanged as RemoteSettingsChanged +from .events import SettingsAcknowledged as SettingsAcknowledged +from .events import UnknownFrameReceived as UnknownFrameReceived +from .events import WindowUpdated as WindowUpdated +from .exceptions import DenialOfServiceError as DenialOfServiceError +from .exceptions import FlowControlError as FlowControlError +from .exceptions import FrameTooLargeError as FrameTooLargeError +from .exceptions import NoAvailableStreamIDError as NoAvailableStreamIDError +from .exceptions import NoSuchStreamError as NoSuchStreamError +from .exceptions import ProtocolError as ProtocolError +from .exceptions import RFC1122Error as RFC1122Error +from .exceptions import StreamClosedError as StreamClosedError +from .exceptions import StreamIDTooLowError as StreamIDTooLowError +from .exceptions import TooManyStreamsError as TooManyStreamsError +from .frame_buffer import FrameBuffer as FrameBuffer +from .settings import SettingCodes as SettingCodes +from .settings import Settings as Settings +from .stream import H2Stream as H2Stream +from .stream import StreamClosedBy as StreamClosedBy +from .utilities import guard_increment_window as guard_increment_window +from .windows import WindowManager as WindowManager + +class ConnectionState(Enum): + IDLE: int + CLIENT_OPEN: int + SERVER_OPEN: int + CLOSED: int + +class ConnectionInputs(Enum): + SEND_HEADERS: int + SEND_PUSH_PROMISE: int + SEND_DATA: int + SEND_GOAWAY: int + SEND_WINDOW_UPDATE: int + SEND_PING: int + SEND_SETTINGS: int + SEND_RST_STREAM: int + SEND_PRIORITY: int + RECV_HEADERS: int + RECV_PUSH_PROMISE: int + RECV_DATA: int + RECV_GOAWAY: int + RECV_WINDOW_UPDATE: int + RECV_PING: int + RECV_SETTINGS: int + RECV_RST_STREAM: int + RECV_PRIORITY: int + SEND_ALTERNATIVE_SERVICE: int + RECV_ALTERNATIVE_SERVICE: int + +class AllowedStreamIDs(IntEnum): + EVEN: int + ODD: int + +class H2ConnectionStateMachine: + state: Incomplete + def __init__(self) -> None: ... + def process_input(self, input_): ... + +class H2Connection: + DEFAULT_MAX_OUTBOUND_FRAME_SIZE: int + DEFAULT_MAX_INBOUND_FRAME_SIZE: Incomplete + HIGHEST_ALLOWED_STREAM_ID: Incomplete + MAX_WINDOW_INCREMENT: Incomplete + DEFAULT_MAX_HEADER_LIST_SIZE: Incomplete + MAX_CLOSED_STREAMS: Incomplete + state_machine: Incomplete + streams: Incomplete + highest_inbound_stream_id: int + highest_outbound_stream_id: int + encoder: Incomplete + decoder: Incomplete + config: Incomplete + local_settings: Incomplete + remote_settings: Incomplete + outbound_flow_control_window: Incomplete + max_outbound_frame_size: Incomplete + max_inbound_frame_size: Incomplete + incoming_buffer: Incomplete + def __init__(self, config: Incomplete | None = ...) -> None: ... + @property + def open_outbound_streams(self): ... + @property + def open_inbound_streams(self): ... 
+ @property + def inbound_flow_control_window(self): ... + def initiate_connection(self) -> None: ... + def initiate_upgrade_connection(self, settings_header: Incomplete | None = ...): ... + def get_next_available_stream_id(self): ... + def send_headers( + self, + stream_id, + headers, + end_stream: bool = ..., + priority_weight: Incomplete | None = ..., + priority_depends_on: Incomplete | None = ..., + priority_exclusive: Incomplete | None = ..., + ) -> None: ... + def send_data( + self, stream_id, data, end_stream: bool = ..., pad_length: Incomplete | None = ... + ) -> None: ... + def end_stream(self, stream_id) -> None: ... + def increment_flow_control_window( + self, increment, stream_id: Incomplete | None = ... + ) -> None: ... + def push_stream(self, stream_id, promised_stream_id, request_headers) -> None: ... + def ping(self, opaque_data) -> None: ... + def reset_stream(self, stream_id, error_code: int = ...) -> None: ... + def close_connection( + self, + error_code: int = ..., + additional_data: Incomplete | None = ..., + last_stream_id: Incomplete | None = ..., + ) -> None: ... + def update_settings(self, new_settings) -> None: ... + def advertise_alternative_service( + self, field_value, origin: Incomplete | None = ..., stream_id: Incomplete | None = ... + ) -> None: ... + def prioritize( + self, + stream_id, + weight: Incomplete | None = ..., + depends_on: Incomplete | None = ..., + exclusive: Incomplete | None = ..., + ) -> None: ... + def local_flow_control_window(self, stream_id): ... + def remote_flow_control_window(self, stream_id): ... + def acknowledge_received_data(self, acknowledged_size, stream_id) -> None: ... + def data_to_send(self, amount: Incomplete | None = ...): ... + def clear_outbound_data_buffer(self) -> None: ... + def receive_data(self, data): ... diff --git a/test_runner/stubs/h2/errors.pyi b/test_runner/stubs/h2/errors.pyi new file mode 100644 index 0000000000..b70c632f8c --- /dev/null +++ b/test_runner/stubs/h2/errors.pyi @@ -0,0 +1,17 @@ +import enum + +class ErrorCodes(enum.IntEnum): + NO_ERROR: int + PROTOCOL_ERROR: int + INTERNAL_ERROR: int + FLOW_CONTROL_ERROR: int + SETTINGS_TIMEOUT: int + STREAM_CLOSED: int + FRAME_SIZE_ERROR: int + REFUSED_STREAM: int + CANCEL: int + COMPRESSION_ERROR: int + CONNECT_ERROR: int + ENHANCE_YOUR_CALM: int + INADEQUATE_SECURITY: int + HTTP_1_1_REQUIRED: int diff --git a/test_runner/stubs/h2/events.pyi b/test_runner/stubs/h2/events.pyi new file mode 100644 index 0000000000..75d0a9e53b --- /dev/null +++ b/test_runner/stubs/h2/events.pyi @@ -0,0 +1,106 @@ +from _typeshed import Incomplete + +from .settings import ChangedSetting as ChangedSetting + +class Event: ... + +class RequestReceived(Event): + stream_id: Incomplete + headers: Incomplete + stream_ended: Incomplete + priority_updated: Incomplete + def __init__(self) -> None: ... + +class ResponseReceived(Event): + stream_id: Incomplete + headers: Incomplete + stream_ended: Incomplete + priority_updated: Incomplete + def __init__(self) -> None: ... + +class TrailersReceived(Event): + stream_id: Incomplete + headers: Incomplete + stream_ended: Incomplete + priority_updated: Incomplete + def __init__(self) -> None: ... + +class _HeadersSent(Event): ... +class _ResponseSent(_HeadersSent): ... +class _RequestSent(_HeadersSent): ... +class _TrailersSent(_HeadersSent): ... +class _PushedRequestSent(_HeadersSent): ... + +class InformationalResponseReceived(Event): + stream_id: Incomplete + headers: Incomplete + priority_updated: Incomplete + def __init__(self) -> None: ... 
+ +class DataReceived(Event): + stream_id: Incomplete + data: Incomplete + flow_controlled_length: Incomplete + stream_ended: Incomplete + def __init__(self) -> None: ... + +class WindowUpdated(Event): + stream_id: Incomplete + delta: Incomplete + def __init__(self) -> None: ... + +class RemoteSettingsChanged(Event): + changed_settings: Incomplete + def __init__(self) -> None: ... + @classmethod + def from_settings(cls, old_settings, new_settings): ... + +class PingReceived(Event): + ping_data: Incomplete + def __init__(self) -> None: ... + +class PingAckReceived(Event): + ping_data: Incomplete + def __init__(self) -> None: ... + +class StreamEnded(Event): + stream_id: Incomplete + def __init__(self) -> None: ... + +class StreamReset(Event): + stream_id: Incomplete + error_code: Incomplete + remote_reset: bool + def __init__(self) -> None: ... + +class PushedStreamReceived(Event): + pushed_stream_id: Incomplete + parent_stream_id: Incomplete + headers: Incomplete + def __init__(self) -> None: ... + +class SettingsAcknowledged(Event): + changed_settings: Incomplete + def __init__(self) -> None: ... + +class PriorityUpdated(Event): + stream_id: Incomplete + weight: Incomplete + depends_on: Incomplete + exclusive: Incomplete + def __init__(self) -> None: ... + +class ConnectionTerminated(Event): + error_code: Incomplete + last_stream_id: Incomplete + additional_data: Incomplete + def __init__(self) -> None: ... + +class AlternativeServiceAvailable(Event): + origin: Incomplete + field_value: Incomplete + def __init__(self) -> None: ... + +class UnknownFrameReceived(Event): + frame: Incomplete + def __init__(self) -> None: ... diff --git a/test_runner/stubs/h2/exceptions.pyi b/test_runner/stubs/h2/exceptions.pyi new file mode 100644 index 0000000000..82019d5ec1 --- /dev/null +++ b/test_runner/stubs/h2/exceptions.pyi @@ -0,0 +1,48 @@ +from _typeshed import Incomplete + +class H2Error(Exception): ... + +class ProtocolError(H2Error): + error_code: Incomplete + +class FrameTooLargeError(ProtocolError): + error_code: Incomplete + +class FrameDataMissingError(ProtocolError): + error_code: Incomplete + +class TooManyStreamsError(ProtocolError): ... + +class FlowControlError(ProtocolError): + error_code: Incomplete + +class StreamIDTooLowError(ProtocolError): + stream_id: Incomplete + max_stream_id: Incomplete + def __init__(self, stream_id, max_stream_id) -> None: ... + +class NoAvailableStreamIDError(ProtocolError): ... + +class NoSuchStreamError(ProtocolError): + stream_id: Incomplete + def __init__(self, stream_id) -> None: ... + +class StreamClosedError(NoSuchStreamError): + stream_id: Incomplete + error_code: Incomplete + def __init__(self, stream_id) -> None: ... + +class InvalidSettingsValueError(ProtocolError, ValueError): + error_code: Incomplete + def __init__(self, msg, error_code) -> None: ... + +class InvalidBodyLengthError(ProtocolError): + expected_length: Incomplete + actual_length: Incomplete + def __init__(self, expected, actual) -> None: ... + +class UnsupportedFrameError(ProtocolError): ... +class RFC1122Error(H2Error): ... 
+ +class DenialOfServiceError(ProtocolError): + error_code: Incomplete diff --git a/test_runner/stubs/h2/frame_buffer.pyi b/test_runner/stubs/h2/frame_buffer.pyi new file mode 100644 index 0000000000..f47adab704 --- /dev/null +++ b/test_runner/stubs/h2/frame_buffer.pyi @@ -0,0 +1,19 @@ +from .exceptions import ( + FrameDataMissingError as FrameDataMissingError, +) +from .exceptions import ( + FrameTooLargeError as FrameTooLargeError, +) +from .exceptions import ( + ProtocolError as ProtocolError, +) + +CONTINUATION_BACKLOG: int + +class FrameBuffer: + data: bytes + max_frame_size: int + def __init__(self, server: bool = ...) -> None: ... + def add_data(self, data) -> None: ... + def __iter__(self): ... + def __next__(self): ... diff --git a/test_runner/stubs/h2/settings.pyi b/test_runner/stubs/h2/settings.pyi new file mode 100644 index 0000000000..a352abe53e --- /dev/null +++ b/test_runner/stubs/h2/settings.pyi @@ -0,0 +1,61 @@ +import enum +from collections.abc import MutableMapping +from typing import Any + +from _typeshed import Incomplete +from h2.errors import ErrorCodes as ErrorCodes +from h2.exceptions import InvalidSettingsValueError as InvalidSettingsValueError + +class SettingCodes(enum.IntEnum): + HEADER_TABLE_SIZE: Incomplete + ENABLE_PUSH: Incomplete + MAX_CONCURRENT_STREAMS: Incomplete + INITIAL_WINDOW_SIZE: Incomplete + MAX_FRAME_SIZE: Incomplete + MAX_HEADER_LIST_SIZE: Incomplete + ENABLE_CONNECT_PROTOCOL: Incomplete + +class ChangedSetting: + setting: Incomplete + original_value: Incomplete + new_value: Incomplete + def __init__(self, setting, original_value, new_value) -> None: ... + +class Settings(MutableMapping[str, Any]): + def __init__(self, client: bool = ..., initial_values: Incomplete | None = ...) -> None: ... + def acknowledge(self): ... + @property + def header_table_size(self): ... + @header_table_size.setter + def header_table_size(self, value) -> None: ... + @property + def enable_push(self): ... + @enable_push.setter + def enable_push(self, value) -> None: ... + @property + def initial_window_size(self): ... + @initial_window_size.setter + def initial_window_size(self, value) -> None: ... + @property + def max_frame_size(self): ... + @max_frame_size.setter + def max_frame_size(self, value) -> None: ... + @property + def max_concurrent_streams(self): ... + @max_concurrent_streams.setter + def max_concurrent_streams(self, value) -> None: ... + @property + def max_header_list_size(self): ... + @max_header_list_size.setter + def max_header_list_size(self, value) -> None: ... + @property + def enable_connect_protocol(self): ... + @enable_connect_protocol.setter + def enable_connect_protocol(self, value) -> None: ... + def __getitem__(self, key): ... + def __setitem__(self, key, value) -> None: ... + def __delitem__(self, key) -> None: ... + def __iter__(self): ... + def __len__(self) -> int: ... + def __eq__(self, other): ... + def __ne__(self, other): ... 
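A short sketch of how the Settings mapping declared in h2/settings.pyi behaves in the real library, as I understand it (writes are staged and only become visible after acknowledge()); illustrative only, not part of this change:

```python
from h2.settings import ChangedSetting, SettingCodes, Settings

# Initial values are applied immediately.
s = Settings(client=True, initial_values={SettingCodes.MAX_CONCURRENT_STREAMS: 100})
assert s.max_concurrent_streams == 100

# Item assignment is staged until the peer ACKs our SETTINGS frame.
s[SettingCodes.INITIAL_WINDOW_SIZE] = 1 << 20
changed = s.acknowledge()  # dict of SettingCodes -> ChangedSetting for applied changes
assert isinstance(changed[SettingCodes.INITIAL_WINDOW_SIZE], ChangedSetting)
assert s.initial_window_size == 1 << 20
```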
diff --git a/test_runner/stubs/h2/stream.pyi b/test_runner/stubs/h2/stream.pyi new file mode 100644 index 0000000000..d52ab8e72b --- /dev/null +++ b/test_runner/stubs/h2/stream.pyi @@ -0,0 +1,184 @@ +from enum import Enum, IntEnum + +from _typeshed import Incomplete + +from .errors import ErrorCodes as ErrorCodes +from .events import ( + AlternativeServiceAvailable as AlternativeServiceAvailable, +) +from .events import ( + DataReceived as DataReceived, +) +from .events import ( + InformationalResponseReceived as InformationalResponseReceived, +) +from .events import ( + PushedStreamReceived as PushedStreamReceived, +) +from .events import ( + RequestReceived as RequestReceived, +) +from .events import ( + ResponseReceived as ResponseReceived, +) +from .events import ( + StreamEnded as StreamEnded, +) +from .events import ( + StreamReset as StreamReset, +) +from .events import ( + TrailersReceived as TrailersReceived, +) +from .events import ( + WindowUpdated as WindowUpdated, +) +from .exceptions import ( + FlowControlError as FlowControlError, +) +from .exceptions import ( + InvalidBodyLengthError as InvalidBodyLengthError, +) +from .exceptions import ( + ProtocolError as ProtocolError, +) +from .exceptions import ( + StreamClosedError as StreamClosedError, +) +from .utilities import ( + HeaderValidationFlags as HeaderValidationFlags, +) +from .utilities import ( + authority_from_headers as authority_from_headers, +) +from .utilities import ( + extract_method_header as extract_method_header, +) +from .utilities import ( + guard_increment_window as guard_increment_window, +) +from .utilities import ( + is_informational_response as is_informational_response, +) +from .utilities import ( + normalize_inbound_headers as normalize_inbound_headers, +) +from .utilities import ( + normalize_outbound_headers as normalize_outbound_headers, +) +from .utilities import ( + validate_headers as validate_headers, +) +from .utilities import ( + validate_outbound_headers as validate_outbound_headers, +) +from .windows import WindowManager as WindowManager + +class StreamState(IntEnum): + IDLE: int + RESERVED_REMOTE: int + RESERVED_LOCAL: int + OPEN: int + HALF_CLOSED_REMOTE: int + HALF_CLOSED_LOCAL: int + CLOSED: int + +class StreamInputs(Enum): + SEND_HEADERS: int + SEND_PUSH_PROMISE: int + SEND_RST_STREAM: int + SEND_DATA: int + SEND_WINDOW_UPDATE: int + SEND_END_STREAM: int + RECV_HEADERS: int + RECV_PUSH_PROMISE: int + RECV_RST_STREAM: int + RECV_DATA: int + RECV_WINDOW_UPDATE: int + RECV_END_STREAM: int + RECV_CONTINUATION: int + SEND_INFORMATIONAL_HEADERS: int + RECV_INFORMATIONAL_HEADERS: int + SEND_ALTERNATIVE_SERVICE: int + RECV_ALTERNATIVE_SERVICE: int + UPGRADE_CLIENT: int + UPGRADE_SERVER: int + +class StreamClosedBy(Enum): + SEND_END_STREAM: int + RECV_END_STREAM: int + SEND_RST_STREAM: int + RECV_RST_STREAM: int + +STREAM_OPEN: Incomplete + +class H2StreamStateMachine: + state: Incomplete + stream_id: Incomplete + client: Incomplete + headers_sent: Incomplete + trailers_sent: Incomplete + headers_received: Incomplete + trailers_received: Incomplete + stream_closed_by: Incomplete + def __init__(self, stream_id) -> None: ... + def process_input(self, input_): ... + def request_sent(self, previous_state): ... + def response_sent(self, previous_state): ... + def request_received(self, previous_state): ... + def response_received(self, previous_state): ... + def data_received(self, previous_state): ... + def window_updated(self, previous_state): ... 
+ def stream_half_closed(self, previous_state): ... + def stream_ended(self, previous_state): ... + def stream_reset(self, previous_state): ... + def send_new_pushed_stream(self, previous_state): ... + def recv_new_pushed_stream(self, previous_state): ... + def send_push_promise(self, previous_state): ... + def recv_push_promise(self, previous_state): ... + def send_end_stream(self, previous_state) -> None: ... + def send_reset_stream(self, previous_state) -> None: ... + def reset_stream_on_error(self, previous_state) -> None: ... + def recv_on_closed_stream(self, previous_state) -> None: ... + def send_on_closed_stream(self, previous_state) -> None: ... + def recv_push_on_closed_stream(self, previous_state) -> None: ... + def send_push_on_closed_stream(self, previous_state) -> None: ... + def send_informational_response(self, previous_state): ... + def recv_informational_response(self, previous_state): ... + def recv_alt_svc(self, previous_state): ... + def send_alt_svc(self, previous_state) -> None: ... + +class H2Stream: + state_machine: Incomplete + stream_id: Incomplete + max_outbound_frame_size: Incomplete + request_method: Incomplete + outbound_flow_control_window: Incomplete + config: Incomplete + def __init__(self, stream_id, config, inbound_window_size, outbound_window_size) -> None: ... + @property + def inbound_flow_control_window(self): ... + @property + def open(self): ... + @property + def closed(self): ... + @property + def closed_by(self): ... + def upgrade(self, client_side) -> None: ... + def send_headers(self, headers, encoder, end_stream: bool = ...): ... + def push_stream_in_band(self, related_stream_id, headers, encoder): ... + def locally_pushed(self): ... + def send_data(self, data, end_stream: bool = ..., pad_length: Incomplete | None = ...): ... + def end_stream(self): ... + def advertise_alternative_service(self, field_value): ... + def increase_flow_control_window(self, increment): ... + def receive_push_promise_in_band(self, promised_stream_id, headers, header_encoding): ... + def remotely_pushed(self, pushed_headers): ... + def receive_headers(self, headers, end_stream, header_encoding): ... + def receive_data(self, data, end_stream, flow_control_len): ... + def receive_window_update(self, increment): ... + def receive_continuation(self) -> None: ... + def receive_alt_svc(self, frame): ... + def reset_stream(self, error_code: int = ...): ... + def stream_reset(self, frame): ... + def acknowledge_received_data(self, acknowledged_size): ... diff --git a/test_runner/stubs/h2/utilities.pyi b/test_runner/stubs/h2/utilities.pyi new file mode 100644 index 0000000000..e0a8d55d1d --- /dev/null +++ b/test_runner/stubs/h2/utilities.pyi @@ -0,0 +1,25 @@ +from typing import NamedTuple + +from _typeshed import Incomplete + +from .exceptions import FlowControlError as FlowControlError +from .exceptions import ProtocolError as ProtocolError + +UPPER_RE: Incomplete +CONNECTION_HEADERS: Incomplete + +def extract_method_header(headers): ... +def is_informational_response(headers): ... +def guard_increment_window(current, increment): ... +def authority_from_headers(headers): ... + +class HeaderValidationFlags(NamedTuple): + is_client: Incomplete + is_trailer: Incomplete + is_response_header: Incomplete + is_push_promise: Incomplete + +def validate_headers(headers, hdr_validation_flags): ... +def normalize_outbound_headers(headers, hdr_validation_flags): ... +def normalize_inbound_headers(headers, hdr_validation_flags): ... 
+def validate_outbound_headers(headers, hdr_validation_flags): ... diff --git a/test_runner/stubs/h2/windows.pyi b/test_runner/stubs/h2/windows.pyi new file mode 100644 index 0000000000..7dc78e431c --- /dev/null +++ b/test_runner/stubs/h2/windows.pyi @@ -0,0 +1,13 @@ +from _typeshed import Incomplete + +from .exceptions import FlowControlError as FlowControlError + +LARGEST_FLOW_CONTROL_WINDOW: Incomplete + +class WindowManager: + max_window_size: Incomplete + current_window_size: Incomplete + def __init__(self, max_window_size) -> None: ... + def window_consumed(self, size) -> None: ... + def window_opened(self, size) -> None: ... + def process_bytes(self, size): ...
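Finally, h2 is a sans-IO state machine, so the connection, events, and stream modules stubbed above can be exercised without any sockets. The standalone sketch below (illustrative only, not part of this diff; the request headers are made up) shows the API surface these stubs cover being driven end to end:

```python
from h2.config import H2Configuration
from h2.connection import H2Connection
from h2.events import RequestReceived, StreamEnded

# Two sans-IO state machines wired back to back, standing in for a real transport.
client = H2Connection(config=H2Configuration(client_side=True, header_encoding="utf-8"))
server = H2Connection(config=H2Configuration(client_side=False, header_encoding="utf-8"))

# Each side queues its connection preface / SETTINGS frames.
client.initiate_connection()
server.initiate_connection()

# Shuttle the queued bytes across, as a socket would.
server.receive_data(client.data_to_send())
client.receive_data(server.data_to_send())

# Open a stream and send a trivial GET request (hypothetical authority and path).
stream_id = client.get_next_available_stream_id()
client.send_headers(
    stream_id,
    [
        (":method", "GET"),
        (":path", "/"),
        (":scheme", "http"),
        (":authority", "example.com"),
    ],
    end_stream=True,
)
events = server.receive_data(client.data_to_send())
assert any(isinstance(e, RequestReceived) for e in events)
assert any(isinstance(e, StreamEnded) for e in events)
```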