diff --git a/.dockerignore b/.dockerignore index a6e11805e9..960588b6f2 100644 --- a/.dockerignore +++ b/.dockerignore @@ -21,4 +21,5 @@ !workspace_hack/ !neon_local/ !scripts/ninstall.sh +!scripts/combine_control_files.py !vm-cgconfig.conf diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index ceb6f4aa90..60ccc56738 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -209,4 +209,4 @@ runs: uses: ./.github/actions/allure-report-store with: report-dir: /tmp/test_output/allure/results - unique-key: ${{ inputs.build_type }} + unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 18dfc458b5..27bad61639 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -955,22 +955,15 @@ jobs: version: [ v14, v15 ] env: - # While on transition period we extract public extensions from compute-node image and custom extensions from extensions image. - # Later all the extensions will be moved to extensions image. - EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest - COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest + EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }} - S3_BUCKETS: | - ${{ github.ref_name == 'release' && - 'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' || - 'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }} + S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }} steps: - name: Pull postgres-extensions image run: | docker pull ${EXTENSIONS_IMAGE} - docker pull ${COMPUTE_NODE_IMAGE} - name: Create postgres-extensions container id: create-container @@ -978,46 +971,23 @@ jobs: EID=$(docker create ${EXTENSIONS_IMAGE} true) echo "EID=${EID}" >> $GITHUB_OUTPUT - CID=$(docker create ${COMPUTE_NODE_IMAGE} true) - echo "CID=${CID}" >> $GITHUB_OUTPUT - - name: Extract postgres-extensions from container run: | - rm -rf ./extensions-to-upload ./custom-extensions # Just in case + rm -rf ./extensions-to-upload # Just in case + mkdir -p extensions-to-upload - # In compute image we have a bit different directory layout - mkdir -p extensions-to-upload/share - docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension - docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib - - # Delete Neon extensitons (they always present on compute-node image) - rm -rf ./extensions-to-upload/share/extension/neon* - rm -rf ./extensions-to-upload/lib/neon* - - # Delete leftovers from the extension build step - rm -rf ./extensions-to-upload/lib/pgxs 
- rm -rf ./extensions-to-upload/lib/pkgconfig - - docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions - for EXT_NAME in $(ls ./custom-extensions); do - mkdir -p ./extensions-to-upload/${EXT_NAME}/share - - mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension - mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib - done + docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/ + docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/ - name: Upload postgres-extensions to S3 - # TODO: Reenable step after switching to the new extensions format (tar-gzipped + index.json) - if: false run: | - for BUCKET in $(echo ${S3_BUCKETS}); do + for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }} done - name: Cleanup - if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }} + if: ${{ always() && steps.create-container.outputs.EID }} run: | - docker rm ${{ steps.create-container.outputs.CID }} || true docker rm ${{ steps.create-container.outputs.EID }} || true deploy: diff --git a/Cargo.lock b/Cargo.lock index 506e5f6c7c..dbeeb539d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2782,7 +2782,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc" dependencies = [ "bytes", "fallible-iterator", @@ -2795,7 +2795,7 @@ dependencies = [ [[package]] name = "postgres-native-tls" version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc" dependencies = [ "native-tls", "tokio", @@ -2806,7 +2806,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc" dependencies = [ "base64 0.20.0", "byteorder", @@ -2824,7 +2824,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc" dependencies = [ "bytes", "fallible-iterator", @@ -4314,7 +4314,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f" +source = 
"git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 44d49d95e8..a0acc061fb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -144,11 +144,11 @@ env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } -postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } +postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } ## Other git libraries heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending @@ -183,7 +183,7 @@ tonic-build = "0.9" # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } ################# Binary contents sections diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 495ef25526..9759faf733 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ - libicu-dev libxslt1-dev liblz4-dev libzstd-dev + libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd ######################################################################################### # @@ -77,6 +77,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH" RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \ echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . 
&& \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -89,17 +90,28 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \ + mkdir -p /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \ - mkdir build && \ - cd build && \ + mkdir build && cd build && \ cmake -DCMAKE_BUILD_TYPE=Release .. && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ + cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \ + sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ + comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T - ######################################################################################### # @@ -419,12 +431,16 @@ RUN apt-get update && \ wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \ echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \ mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ - mkdir build && \ - cd build && \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ + mkdir build && cd build && \ cmake -DCMAKE_BUILD_TYPE=Release .. 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ + mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \ + sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ + comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T - ######################################################################################### # @@ -535,10 +551,8 @@ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023 -# There is no release tag yet -RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \ - echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \ + echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \ mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -553,16 +567,17 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b26 FROM build-deps AS pg-anon-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \ echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . 
&& \ - find /usr/local/pgsql -type f | sort > /before.txt && \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ - find /usr/local/pgsql -type f | sort > /after.txt && \ - /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done' + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ + mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \ + sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ + comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T - ######################################################################################### # @@ -754,16 +769,23 @@ RUN rm /usr/local/pgsql/lib/lib*.a # Extenstion only # ######################################################################################### +FROM python:3.9-slim-bullseye AS generate-ext-index +ARG PG_VERSION +ARG BUILD_TAG +RUN apt update && apt install -y zstd + +# copy the control files here +COPY --from=kq-imcx-pg-build /extensions/ /extensions/ +COPY --from=pg-anon-pg-build /extensions/ /extensions/ +COPY --from=postgis-build /extensions/ /extensions/ +COPY scripts/combine_control_files.py ./combine_control_files.py +RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis" + FROM scratch AS postgres-extensions # After the transition this layer will include all extensitons. -# As for now, it's only for new custom ones -# -# # Default extensions -# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension -# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib -# Custom extensions -COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib -COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension +# As for now, it's only a couple for testing purposses +COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/ +COPY --from=generate-ext-index /ext_index.json /ext_index.json ######################################################################################### # diff --git a/Makefile b/Makefile index ae979b8b4c..0768b64502 100644 --- a/Makefile +++ b/Makefile @@ -108,6 +108,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install +@echo "Compiling pageinspect $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install + +@echo "Compiling amcheck $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install .PHONY: postgres-clean-% postgres-clean-%: diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 4953e5f331..9d15a203e5 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -193,6 +193,13 @@ fn main() -> Result<()> { if !spec_set { // No spec provided, hang waiting for it. info!("no compute spec provided, waiting"); + + // TODO this can stall startups in the unlikely event that we bind + // this compute node while it's busy prewarming. It's not too + // bad because it's just 100ms and unlikely, but it's an + // avoidable problem. 
+ compute.prewarm_postgres()?; + let mut state = compute.state.lock().unwrap(); while state.status != ComputeStatus::ConfigurationPending { state = compute.state_changed.wait(state).unwrap(); diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index b33f4f05dd..254d367a71 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -8,9 +8,11 @@ use std::sync::{Condvar, Mutex}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; +use futures::stream::FuturesUnordered; +use futures::StreamExt; use postgres::{Client, NoTls}; use tokio_postgres; -use tracing::{info, instrument, warn}; +use tracing::{error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -21,6 +23,7 @@ use utils::measured_stream::MeasuredReader; use crate::config; use crate::pg_helpers::*; use crate::spec::*; +use crate::sync_sk::{check_if_synced, ping_safekeeper}; /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { @@ -86,6 +89,7 @@ pub struct ParsedSpec { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub pageserver_connstr: String, + pub safekeeper_connstrings: Vec<String>, pub storage_auth_token: Option<String>, } @@ -103,6 +107,21 @@ impl TryFrom<ComputeSpec> for ParsedSpec { .clone() .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring")) .ok_or("pageserver connstr should be provided")?; + let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() { + if matches!(spec.mode, ComputeMode::Primary) { + spec.cluster + .settings + .find("neon.safekeepers") + .ok_or("safekeeper connstrings should be provided")? + .split(',') + .map(|str| str.to_string()) + .collect() + } else { + vec![] + } + } else { + spec.safekeeper_connstrings.clone() + }; let storage_auth_token = spec.storage_auth_token.clone(); let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id { tenant_id @@ -128,6 +147,7 @@ impl TryFrom<ComputeSpec> for ParsedSpec { Ok(ParsedSpec { spec, pageserver_connstr, + safekeeper_connstrings, storage_auth_token, tenant_id, timeline_id, @@ -309,6 +329,102 @@ impl ComputeNode { Ok(()) } + pub async fn check_safekeepers_synced_async( + &self, + compute_state: &ComputeState, + ) -> Result<Option<Lsn>> { + // Construct a connection config for each safekeeper + let pspec: ParsedSpec = compute_state + .pspec + .as_ref() + .expect("spec must be set") + .clone(); + let sk_connstrs: Vec<String> = pspec.safekeeper_connstrings.clone(); + let sk_configs = sk_connstrs.into_iter().map(|connstr| { + // Format connstr + let id = connstr.clone(); + let connstr = format!("postgresql://no_user@{}", connstr); + let options = format!( + "-c timeline_id={} tenant_id={}", + pspec.timeline_id, pspec.tenant_id + ); + + // Construct client + let mut config = tokio_postgres::Config::from_str(&connstr).unwrap(); + config.options(&options); + if let Some(storage_auth_token) = pspec.storage_auth_token.clone() { + config.password(storage_auth_token); + } + + (id, config) + }); + + // Create task set to query all safekeepers + let mut tasks = FuturesUnordered::new(); + let quorum = sk_configs.len() / 2 + 1; + for (id, config) in sk_configs { + let timeout = tokio::time::Duration::from_millis(100); + let task = tokio::time::timeout(timeout, ping_safekeeper(id, config)); + tasks.push(tokio::spawn(task)); + } + + // Get a quorum of responses or errors + let mut responses = Vec::new(); + let mut join_errors = Vec::new(); + let mut task_errors = Vec::new(); + let mut timeout_errors = Vec::new(); + while let Some(response) = tasks.next().await { match 
response { + Ok(Ok(Ok(r))) => responses.push(r), + Ok(Ok(Err(e))) => task_errors.push(e), + Ok(Err(e)) => timeout_errors.push(e), + Err(e) => join_errors.push(e), + }; + if responses.len() >= quorum { + break; + } + if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum { + break; + } + } + + // In case of error, log and fail the check, but don't crash. + // We're playing it safe because these errors could be transient + // and we don't yet retry. Also being careful here allows us to + // be backwards compatible with safekeepers that don't have the + // TIMELINE_STATUS API yet. + if responses.len() < quorum { + error!( + "failed sync safekeepers check {:?} {:?} {:?}", + join_errors, task_errors, timeout_errors + ); + return Ok(None); + } + + Ok(check_if_synced(responses)) + } + + // Fast path for sync_safekeepers. If they're already synced we get the lsn + // in one roundtrip. If not, we should do a full sync_safekeepers. + pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result> { + let start_time = Utc::now(); + + // Run actual work with new tokio runtime + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create rt"); + let result = rt.block_on(self.check_safekeepers_synced_async(compute_state)); + + // Record runtime + self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64; + result + } + // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. #[instrument(skip_all)] @@ -371,10 +487,14 @@ impl ComputeNode { // cannot sync safekeepers. let lsn = match spec.mode { ComputeMode::Primary => { - info!("starting safekeepers syncing"); - let lsn = self - .sync_safekeepers(pspec.storage_auth_token.clone()) - .with_context(|| "failed to sync safekeepers")?; + info!("checking if safekeepers are synced"); + let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) { + lsn + } else { + info!("starting safekeepers syncing"); + self.sync_safekeepers(pspec.storage_auth_token.clone()) + .with_context(|| "failed to sync safekeepers")? + }; info!("safekeepers synced at LSN {}", lsn); lsn } @@ -412,6 +532,50 @@ impl ComputeNode { Ok(()) } + /// Start and stop a postgres process to warm up the VM for startup. 
+ pub fn prewarm_postgres(&self) -> Result<()> { + info!("prewarming"); + + // Create pgdata + let pgdata = &format!("{}.warmup", self.pgdata); + create_pgdata(pgdata)?; + + // Run initdb to completion + info!("running initdb"); + let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb"); + Command::new(initdb_bin) + .args(["-D", pgdata]) + .output() + .expect("cannot start initdb process"); + + // Write conf + use std::io::Write; + let conf_path = Path::new(pgdata).join("postgresql.conf"); + let mut file = std::fs::File::create(conf_path)?; + writeln!(file, "shared_buffers=65536")?; + writeln!(file, "port=51055")?; // Nobody should be connecting + writeln!(file, "shared_preload_libraries = 'neon'")?; + + // Start postgres + info!("starting postgres"); + let mut pg = Command::new(&self.pgbin) + .args(["-D", pgdata]) + .spawn() + .expect("cannot start postgres process"); + + // Stop it when it's ready + info!("waiting for postgres"); + wait_for_postgres(&mut pg, Path::new(pgdata))?; + pg.kill()?; + info!("sent kill signal"); + pg.wait()?; + info!("done prewarming"); + + // clean up + let _ok = fs::remove_dir_all(pgdata); + Ok(()) + } + /// Start Postgres as a child process and manage DBs/roles. /// After that this will hang waiting on the postmaster process to exit. #[instrument(skip_all)] diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 24811f75ee..1d7b09f095 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -13,3 +13,4 @@ pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; +pub mod sync_sk; diff --git a/compute_tools/src/sync_sk.rs b/compute_tools/src/sync_sk.rs new file mode 100644 index 0000000000..22b7027b93 --- /dev/null +++ b/compute_tools/src/sync_sk.rs @@ -0,0 +1,98 @@ +// Utils for running sync_safekeepers +use anyhow::Result; +use tracing::info; +use utils::lsn::Lsn; + +#[derive(Copy, Clone, Debug)] +pub enum TimelineStatusResponse { + NotFound, + Ok(TimelineStatusOkResponse), +} + +#[derive(Copy, Clone, Debug)] +pub struct TimelineStatusOkResponse { + flush_lsn: Lsn, + commit_lsn: Lsn, +} + +/// Get a safekeeper's metadata for our timeline. The id is only used for logging +pub async fn ping_safekeeper( + id: String, + config: tokio_postgres::Config, +) -> Result<TimelineStatusResponse> { + // TODO add retries + + // Connect + info!("connecting to {}", id); + let (client, conn) = config.connect(tokio_postgres::NoTls).await?; + tokio::spawn(async move { + if let Err(e) = conn.await { + eprintln!("connection error: {}", e); + } + }); + + // Query + info!("querying {}", id); + let result = client.simple_query("TIMELINE_STATUS").await?; + + // Parse result + info!("done with {}", id); + if let postgres::SimpleQueryMessage::Row(row) = &result[0] { + use std::str::FromStr; + let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse { + flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?, + commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?, + }); + Ok(response) + } else { + // Timeline doesn't exist + Ok(TimelineStatusResponse::NotFound) + } +} + +/// Given a quorum of responses, check if safekeepers are synced at some Lsn +pub fn check_if_synced(responses: Vec<TimelineStatusResponse>) -> Option<Lsn> { + // Check if all responses are ok + let ok_responses: Vec<TimelineStatusOkResponse> = responses + .iter() + .filter_map(|r| match r { + TimelineStatusResponse::Ok(ok_response) => Some(ok_response), + _ => None, + }) + .cloned() + .collect(); + if ok_responses.len() < responses.len() { + info!( + "not synced. 
Only {} out of {} know about this timeline", + ok_responses.len(), + responses.len() + ); + return None; + } + + // Get the min and the max of everything + let commit: Vec<Lsn> = ok_responses.iter().map(|r| r.commit_lsn).collect(); + let flush: Vec<Lsn> = ok_responses.iter().map(|r| r.flush_lsn).collect(); + let commit_max = commit.iter().max().unwrap(); + let commit_min = commit.iter().min().unwrap(); + let flush_max = flush.iter().max().unwrap(); + let flush_min = flush.iter().min().unwrap(); + + // Check that all values are equal + if commit_min != commit_max { + info!("not synced. {:?} {:?}", commit_min, commit_max); + return None; + } + if flush_min != flush_max { + info!("not synced. {:?} {:?}", flush_min, flush_max); + return None; + } + + // Check that commit == flush + if commit_max != flush_max { + info!("not synced. {:?} {:?}", commit_max, flush_max); + return None; + } + + Some(*commit_max) +} diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 6df6e47f29..8b5c88bf01 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -564,9 +564,7 @@ impl Endpoint { } Err(e) => { if attempt == MAX_ATTEMPTS { - return Err(e).context( - "timed out waiting to connect to compute_ctl HTTP; last error: {e}", - ); + return Err(e).context("timed out waiting to connect to compute_ctl HTTP"); } } } diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 6124c81f50..f2865b71ec 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -70,6 +70,7 @@ where pub struct ComputeMetrics { pub wait_for_spec_ms: u64, pub sync_safekeepers_ms: u64, + pub sync_sk_check_ms: u64, pub basebackup_ms: u64, pub basebackup_bytes: u64, pub start_postgres_ms: u64, diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs index 3aac00662d..3a1b396d63 100644 --- a/libs/consumption_metrics/src/lib.rs +++ b/libs/consumption_metrics/src/lib.rs @@ -5,7 +5,7 @@ use chrono::{DateTime, Utc}; use rand::Rng; use serde::Serialize; -#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] +#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] #[serde(tag = "type")] pub enum EventType { #[serde(rename = "absolute")] @@ -17,6 +17,32 @@ pub enum EventType { }, } +impl EventType { + pub fn absolute_time(&self) -> Option<&DateTime<Utc>> { + use EventType::*; + match self { + Absolute { time } => Some(time), + _ => None, + } + } + + pub fn incremental_timerange(&self) -> Option<std::ops::Range<&DateTime<Utc>>> { + // these can most likely be thought of as Range or RangeFull + use EventType::*; + match self { + Incremental { + start_time, + stop_time, + } => Some(start_time..stop_time), + _ => None, + } + } + + pub fn is_incremental(&self) -> bool { + matches!(self, EventType::Incremental { .. }) + } +} + #[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] pub struct Event { #[serde(flatten)] diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 5c5e8a9559..809fa5fffd 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -179,7 +179,7 @@ pub struct FeExecuteMessage { #[derive(Debug)] pub struct FeCloseMessage; -/// An error occured while parsing or serializing raw stream into Postgres +/// An error occurred while parsing or serializing raw stream into Postgres /// messages. 
#[derive(thiserror::Error, Debug)] pub enum ProtocolError { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 43d818dfb9..37a6bf23e8 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -200,13 +200,17 @@ impl S3Bucket { ) } - fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { - let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default(); - for segment in path.0.iter() { - full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - full_path.push_str(segment.to_str().unwrap_or_default()); + pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { + assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); + let path_string = path + .get_path() + .to_string_lossy() + .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR) + .to_string(); + match &self.prefix_in_bucket { + Some(prefix) => prefix.clone() + "/" + &path_string, + None => path_string, } - full_path } async fn download_object(&self, request: GetObjectRequest) -> Result { @@ -427,10 +431,12 @@ impl RemoteStorage for S3Bucket { } async fn download(&self, from: &RemotePath) -> Result { + // if prefix is not none then download file `prefix/from` + // if prefix is none then download file `from` self.download_object(GetObjectRequest { bucket: self.bucket_name.clone(), key: self.relative_path_to_s3_object(from), - ..GetObjectRequest::default() + range: None, }) .await } @@ -523,3 +529,63 @@ impl RemoteStorage for S3Bucket { Ok(()) } } + +#[cfg(test)] +mod tests { + use std::num::NonZeroUsize; + use std::path::Path; + + use crate::{RemotePath, S3Bucket, S3Config}; + + #[test] + fn relative_path() { + let all_paths = vec!["", "some/path", "some/path/"]; + let all_paths: Vec = all_paths + .iter() + .map(|x| RemotePath::new(Path::new(x)).expect("bad path")) + .collect(); + let prefixes = [ + None, + Some(""), + Some("test/prefix"), + Some("test/prefix/"), + Some("/test/prefix/"), + ]; + let expected_outputs = vec![ + vec!["", "some/path", "some/path"], + vec!["/", "/some/path", "/some/path"], + vec![ + "test/prefix/", + "test/prefix/some/path", + "test/prefix/some/path", + ], + vec![ + "test/prefix/", + "test/prefix/some/path", + "test/prefix/some/path", + ], + vec![ + "test/prefix/", + "test/prefix/some/path", + "test/prefix/some/path", + ], + ]; + + for (prefix_idx, prefix) in prefixes.iter().enumerate() { + let config = S3Config { + bucket_name: "bucket".to_owned(), + bucket_region: "region".to_owned(), + prefix_in_bucket: prefix.map(str::to_string), + endpoint: None, + concurrency_limit: NonZeroUsize::new(100).unwrap(), + max_keys_per_list_response: Some(5), + }; + let storage = S3Bucket::new(&config).expect("remote storage init"); + for (test_path_idx, test_path) in all_paths.iter().enumerate() { + let result = storage.relative_path_to_s3_object(test_path); + let expected = expected_outputs[prefix_idx][test_path_idx]; + assert_eq!(result, expected); + } + } + } +} diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 0917bab39c..982c01a9be 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new(); const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; -const BASE_PREFIX: &str = "test/"; +const BASE_PREFIX: &str = "test"; /// Tests that S3 client can list all prefixes, even if the response come paginated and 
requires multiple S3 queries. /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 0ef0464267..090912d276 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -24,12 +24,29 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool> Ok(dir.next_entry().await?.is_none()) } +pub fn ignore_not_found(e: io::Error) -> io::Result<()> { + if e.kind() == io::ErrorKind::NotFound { + Ok(()) + } else { + Err(e) + } +} + +pub fn ignore_absent_files<F>(fs_operation: F) -> io::Result<()> +where + F: Fn() -> io::Result<()>, +{ + fs_operation().or_else(ignore_not_found) +} + #[cfg(test)] mod test { use std::path::PathBuf; use crate::fs_ext::is_directory_empty; + use super::ignore_absent_files; + #[test] fn is_empty_dir() { use super::PathExt; @@ -75,4 +92,21 @@ mod test { std::fs::remove_file(&file_path).unwrap(); assert!(is_directory_empty(file_path).await.is_err()); } + + #[test] + fn ignore_absent_files_works() { + let dir = tempfile::tempdir().unwrap(); + let dir_path = dir.path(); + + let file_path: PathBuf = dir_path.join("testfile"); + + ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally"); + + let f = std::fs::File::create(&file_path).unwrap(); + drop(f); + + ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally"); + + assert!(!file_path.exists()); + } } diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 20b601f68d..2ce92ee914 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -1,5 +1,7 @@ +use std::ffi::OsStr; use std::{fmt, str::FromStr}; +use anyhow::Context; use hex::FromHex; use rand::Rng; use serde::{Deserialize, Serialize}; @@ -213,6 +215,18 @@ pub struct TimelineId(Id); id_newtype!(TimelineId); +impl TryFrom<Option<&OsStr>> for TimelineId { + type Error = anyhow::Error; + + fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> { + value + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::<TimelineId>() + .with_context(|| format!("Could not parse timeline id from {:?}", value)) + } +} + /// Neon Tenant Id represents identifiar of a particular tenant. /// Is used for distinguishing requests and data belonging to different users. 
/// diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 4c6df469aa..be806c77ec 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -33,7 +33,8 @@ use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; use crate::{ - IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX, + IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, + TIMELINE_UNINIT_MARK_SUFFIX, }; pub mod defaults { @@ -601,6 +602,17 @@ impl PageServerConf { ) } + pub fn timeline_delete_mark_file_path( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> PathBuf { + path_with_suffix_extension( + self.timeline_path(&tenant_id, &timeline_id), + TIMELINE_DELETE_MARK_SUFFIX, + ) + } + pub fn traces_path(&self) -> PathBuf { self.workdir.join("traces") } diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index df44300fce..45b4be470b 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -7,7 +7,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::{mgr, LogicalSizeCalculationCause}; use anyhow; -use chrono::Utc; +use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use pageserver_api::models::TenantState; use reqwest::Url; @@ -18,12 +18,6 @@ use std::time::Duration; use tracing::*; use utils::id::{NodeId, TenantId, TimelineId}; -const WRITTEN_SIZE: &str = "written_size"; -const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size"; -const RESIDENT_SIZE: &str = "resident_size"; -const REMOTE_STORAGE_SIZE: &str = "remote_storage_size"; -const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size"; - const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); #[serde_as] @@ -44,6 +38,121 @@ pub struct PageserverConsumptionMetricsKey { pub metric: &'static str, } +impl PageserverConsumptionMetricsKey { + const fn absolute_values(self) -> AbsoluteValueFactory { + AbsoluteValueFactory(self) + } + const fn incremental_values(self) -> IncrementalValueFactory { + IncrementalValueFactory(self) + } +} + +/// Helper type which each individual metric kind can return to produce only absolute values. +struct AbsoluteValueFactory(PageserverConsumptionMetricsKey); + +impl AbsoluteValueFactory { + fn now(self, val: u64) -> (PageserverConsumptionMetricsKey, (EventType, u64)) { + let key = self.0; + let time = Utc::now(); + (key, (EventType::Absolute { time }, val)) + } +} + +/// Helper type which each individual metric kind can return to produce only incremental values. 
+struct IncrementalValueFactory(PageserverConsumptionMetricsKey); + +impl IncrementalValueFactory { + #[allow(clippy::wrong_self_convention)] + fn from_previous_up_to( + self, + prev_end: DateTime<Utc>, + up_to: DateTime<Utc>, + val: u64, + ) -> (PageserverConsumptionMetricsKey, (EventType, u64)) { + let key = self.0; + // cannot assert prev_end < up_to because these are realtime clock based + ( + key, + ( + EventType::Incremental { + start_time: prev_end, + stop_time: up_to, + }, + val, + ), + ) + } + + fn key(&self) -> &PageserverConsumptionMetricsKey { + &self.0 + } +} + +// the static part of a PageserverConsumptionMetricsKey +impl PageserverConsumptionMetricsKey { + const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory { + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: "written_size", + } + .absolute_values() + } + + /// Values will be the difference of the latest written_size (last_record_lsn) to what we + /// previously sent. + const fn written_size_delta( + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> IncrementalValueFactory { + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: "written_size_bytes_delta", + } + .incremental_values() + } + + const fn timeline_logical_size( + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> AbsoluteValueFactory { + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: "timeline_logical_size", + } + .absolute_values() + } + + const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory { + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: None, + metric: "remote_storage_size", + } + .absolute_values() + } + + const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory { + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: None, + metric: "resident_size", + } + .absolute_values() + } + + const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory { + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: None, + metric: "synthetic_storage_size", + } + .absolute_values() + } +} + /// Main thread that serves metrics collection pub async fn collect_metrics( metric_collection_endpoint: &Url, @@ -79,7 +188,7 @@ pub async fn collect_metrics( .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT) .build() .expect("Failed to create http client with timeout"); - let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new(); + let mut cached_metrics = HashMap::new(); let mut prev_iteration_time: std::time::Instant = std::time::Instant::now(); loop { @@ -121,13 +230,13 @@ pub async fn collect_metrics( /// - refactor this function (chunking+sending part) to reuse it in proxy module; pub async fn collect_metrics_iteration( client: &reqwest::Client, - cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, u64>, + cached_metrics: &mut HashMap<PageserverConsumptionMetricsKey, (EventType, u64)>, metric_collection_endpoint: &reqwest::Url, node_id: NodeId, ctx: &RequestContext, send_cached: bool, ) { - let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new(); + let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, (EventType, u64))> = Vec::new(); trace!( "starting collect_metrics_iteration. 
metric_collection_endpoint: {}", metric_collection_endpoint @@ -166,27 +275,80 @@ pub async fn collect_metrics_iteration( if timeline.is_active() { let timeline_written_size = u64::from(timeline.get_last_record_lsn()); - current_metrics.push(( - PageserverConsumptionMetricsKey { - tenant_id, - timeline_id: Some(timeline.timeline_id), - metric: WRITTEN_SIZE, + let (key, written_size_now) = + PageserverConsumptionMetricsKey::written_size(tenant_id, timeline.timeline_id) + .now(timeline_written_size); + + // last_record_lsn can only go up, right now at least, TODO: #2592 or related + // features might change this. + + let written_size_delta_key = PageserverConsumptionMetricsKey::written_size_delta( + tenant_id, + timeline.timeline_id, + ); + + // use this when available, because in a stream of incremental values, it will be + // accurate where as when last_record_lsn stops moving, we will only cache the last + // one of those. + let last_stop_time = + cached_metrics + .get(written_size_delta_key.key()) + .map(|(until, _val)| { + until + .incremental_timerange() + .expect("never create EventType::Absolute for written_size_delta") + .end + }); + + // by default, use the last sent written_size as the basis for + // calculating the delta. if we don't yet have one, use the load time value. + let prev = cached_metrics + .get(&key) + .map(|(prev_at, prev)| { + // use the prev time from our last incremental update, or default to latest + // absolute update on the first round. + let prev_at = prev_at + .absolute_time() + .expect("never create EventType::Incremental for written_size"); + let prev_at = last_stop_time.unwrap_or(prev_at); + (*prev_at, *prev) + }) + .unwrap_or_else(|| { + // if we don't have a previous point of comparison, compare to the load time + // lsn. + let (disk_consistent_lsn, loaded_at) = &timeline.loaded_at; + (DateTime::from(*loaded_at), disk_consistent_lsn.0) + }); + + // written_size_delta_bytes + current_metrics.extend( + if let Some(delta) = written_size_now.1.checked_sub(prev.1) { + let up_to = written_size_now + .0 + .absolute_time() + .expect("never create EventType::Incremental for written_size"); + let key_value = + written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta); + Some(key_value) + } else { + None }, - timeline_written_size, - )); + ); + + // written_size + current_metrics.push((key, written_size_now)); let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id); match span.in_scope(|| timeline.get_current_logical_size(ctx)) { // Only send timeline logical size when it is fully calculated. 
Ok((size, is_exact)) if is_exact => { - current_metrics.push(( - PageserverConsumptionMetricsKey { + current_metrics.push( + PageserverConsumptionMetricsKey::timeline_logical_size( tenant_id, - timeline_id: Some(timeline.timeline_id), - metric: TIMELINE_LOGICAL_SIZE, - }, - size, - )); + timeline.timeline_id, + ) + .now(size), + ); } Ok((_, _)) => {} Err(err) => { @@ -205,14 +367,10 @@ pub async fn collect_metrics_iteration( match tenant.get_remote_size().await { Ok(tenant_remote_size) => { - current_metrics.push(( - PageserverConsumptionMetricsKey { - tenant_id, - timeline_id: None, - metric: REMOTE_STORAGE_SIZE, - }, - tenant_remote_size, - )); + current_metrics.push( + PageserverConsumptionMetricsKey::remote_storage_size(tenant_id) + .now(tenant_remote_size), + ); } Err(err) => { error!( @@ -222,14 +380,9 @@ pub async fn collect_metrics_iteration( } } - current_metrics.push(( - PageserverConsumptionMetricsKey { - tenant_id, - timeline_id: None, - metric: RESIDENT_SIZE, - }, - tenant_resident_size, - )); + current_metrics.push( + PageserverConsumptionMetricsKey::resident_size(tenant_id).now(tenant_resident_size), + ); // Note that this metric is calculated in a separate bgworker // Here we only use cached value, which may lag behind the real latest one @@ -237,23 +390,27 @@ pub async fn collect_metrics_iteration( if tenant_synthetic_size != 0 { // only send non-zeroes because otherwise these show up as errors in logs - current_metrics.push(( - PageserverConsumptionMetricsKey { - tenant_id, - timeline_id: None, - metric: SYNTHETIC_STORAGE_SIZE, - }, - tenant_synthetic_size, - )); + current_metrics.push( + PageserverConsumptionMetricsKey::synthetic_size(tenant_id) + .now(tenant_synthetic_size), + ); } } // Filter metrics, unless we want to send all metrics, including cached ones. // See: https://github.com/neondatabase/neon/issues/3485 if !send_cached { - current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) { - Some(val) => val != curr_val, - None => true, + current_metrics.retain(|(curr_key, (kind, curr_val))| { + if kind.is_incremental() { + // incremental values (currently only written_size_delta) should not get any cache + // deduplication because they will be used by upstream for "is still alive." + true + } else { + match cached_metrics.get(curr_key) { + Some((_, val)) => val != curr_val, + None => true, + } + } }); } @@ -272,8 +429,8 @@ pub async fn collect_metrics_iteration( chunk_to_send.clear(); // enrich metrics with type,timestamp and idempotency key before sending - chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event { - kind: EventType::Absolute { time: Utc::now() }, + chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event { + kind: *when, metric: curr_key.metric, idempotency_key: idempotency_key(node_id.to_string()), value: *curr_val, diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 042d4c6d06..b729b6a643 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -545,12 +545,12 @@ async fn collect_eviction_candidates( // We could be better here, e.g., sum of all L0 layers + most recent L1 layer. // That's what's typically used by the various background loops. // - // The default can be overriden with a fixed value in the tenant conf. + // The default can be overridden with a fixed value in the tenant conf. // A default override can be put in the default tenant conf in the pageserver.toml. 
let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { debug!( tenant_id=%tenant.tenant_id(), - overriden_size=s, + overridden_size=s, "using overridden min resident size for tenant" ); s diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 5831091098..f43651e931 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -109,6 +109,8 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp"; /// Full path: `tenants//timelines/___uninit`. pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; +pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; + /// A marker file to prevent pageserver from loading a certain tenant on restart. /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding /// `ignore` management API command, that expects the ignored tenant to be properly loaded @@ -123,15 +125,30 @@ pub fn is_temporary(path: &Path) -> bool { } } -pub fn is_uninit_mark(path: &Path) -> bool { +fn ends_with_suffix(path: &Path, suffix: &str) -> bool { match path.file_name() { - Some(name) => name - .to_string_lossy() - .ends_with(TIMELINE_UNINIT_MARK_SUFFIX), + Some(name) => name.to_string_lossy().ends_with(suffix), None => false, } } +pub fn is_uninit_mark(path: &Path) -> bool { + ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) +} + +pub fn is_delete_mark(path: &Path) -> bool { + ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) +} + +fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool { + if let Some(e) = e.io_error() { + if e.kind() == std::io::ErrorKind::NotFound { + return true; + } + } + false +} + /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. /// diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c233f36da6..84f2d923e2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -73,7 +73,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { // Buckets for background operations like compaction, GC, size calculation const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0]; -pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { +pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_storage_operations_seconds_global", "Time spent on storage operations", @@ -93,7 +93,7 @@ pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { }); // Metrics collected on operations on the storage repository. 
-pub static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { +pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value (reconstruct a page from deltas)", @@ -102,7 +102,7 @@ pub static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { +pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_materialized_cache_hits_direct_total", "Number of cache hits from materialized page cache without redo", @@ -119,7 +119,7 @@ pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { +pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_materialized_cache_hits_total", "Number of cache hits from materialized page cache", @@ -280,7 +280,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy = Lazy::new(|| { +pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_remote_ondemand_downloaded_layers_total", "Total on-demand downloaded layers" @@ -288,7 +288,7 @@ pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy = Lazy::new(|| { .unwrap() }); -pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy = Lazy::new(|| { +pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_remote_ondemand_downloaded_bytes_total", "Total bytes of layers on-demand downloaded", @@ -327,7 +327,7 @@ pub(crate) static BROKEN_TENANTS_SET: Lazy = Lazy::new(|| { .expect("Failed to register pageserver_tenant_states_count metric") }); -pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(|| { +pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_synthetic_cached_size_bytes", "Synthetic size of each tenant in bytes", @@ -385,7 +385,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy = Lazy::new(|| .expect("failed to define a metric") }); -pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy = Lazy::new(|| { +pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_unexpected_ondemand_downloads_count", "Number of unexpected on-demand downloads. 
\ @@ -690,7 +690,7 @@ pub(crate) static REMOTE_OPERATION_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { +pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_tenant_task_events", "Number of task start/stop/fail events.", @@ -699,7 +699,7 @@ pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); -pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { +pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_background_loop_period_overrun_count", "Incremented whenever warn_when_period_overrun() logs a warning.", @@ -710,7 +710,7 @@ pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new // walreceiver metrics -pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { +pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_walreceiver_started_connections_total", "Number of started walreceiver connections" @@ -718,7 +718,7 @@ pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy = Lazy::new(|| { +pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy = Lazy::new(|| { register_int_gauge!( "pageserver_walreceiver_active_managers", "Number of active walreceiver managers" @@ -726,7 +726,7 @@ pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WALRECEIVER_SWITCHES: Lazy = Lazy::new(|| { +pub(crate) static WALRECEIVER_SWITCHES: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_walreceiver_switches_total", "Number of walreceiver manager change_connection calls", @@ -735,7 +735,7 @@ pub static WALRECEIVER_SWITCHES: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WALRECEIVER_BROKER_UPDATES: Lazy = Lazy::new(|| { +pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_walreceiver_broker_updates_total", "Number of received broker updates in walreceiver" @@ -743,7 +743,7 @@ pub static WALRECEIVER_BROKER_UPDATES: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy = Lazy::new(|| { +pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_walreceiver_candidates_events_total", "Number of walreceiver candidate events", @@ -752,10 +752,10 @@ pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WALRECEIVER_CANDIDATES_ADDED: Lazy = +pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy = Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"])); -pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy = +pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy = Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"])); // Metrics collected on WAL redo operations @@ -802,7 +802,7 @@ macro_rules! 
redo_bytes_histogram_count_buckets { }; } -pub static WAL_REDO_TIME: Lazy = Lazy::new(|| { +pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_seconds", "Time spent on WAL redo", @@ -811,7 +811,7 @@ pub static WAL_REDO_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { +pub(crate) static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_wait_seconds", "Time spent waiting for access to the Postgres WAL redo process", @@ -820,7 +820,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { +pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_records_histogram", "Histogram of number of records replayed per redo in the Postgres WAL redo process", @@ -829,7 +829,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { +pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_bytes_histogram", "Histogram of number of records replayed per redo sent to Postgres", @@ -838,7 +838,8 @@ pub static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { +// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count? +pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_replayed_wal_records_total", "Number of WAL records replayed in WAL redo process" @@ -1394,15 +1395,51 @@ impl>, O, E> Future for MeasuredRemoteOp { } pub fn preinitialize_metrics() { - // We want to alert on this metric increasing. - // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0. - assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0); - UNEXPECTED_ONDEMAND_DOWNLOADS.reset(); + // Python tests need these and on some we do alerting. + // + // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of + // order: + // - global metrics reside in a Lazy + // - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc() + // - could move the statics into TimelineMetrics::new()? - // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels. - BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset(); + // counters + [ + &MATERIALIZED_PAGE_CACHE_HIT, + &MATERIALIZED_PAGE_CACHE_HIT_DIRECT, + &UNEXPECTED_ONDEMAND_DOWNLOADS, + &WALRECEIVER_STARTED_CONNECTIONS, + &WALRECEIVER_BROKER_UPDATES, + &WALRECEIVER_CANDIDATES_ADDED, + &WALRECEIVER_CANDIDATES_REMOVED, + ] + .into_iter() + .for_each(|c| { + Lazy::force(c); + }); - // Python tests need these. 
- MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get(); - MATERIALIZED_PAGE_CACHE_HIT.get(); + // countervecs + [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] + .into_iter() + .for_each(|c| { + Lazy::force(c); + }); + + // gauges + WALRECEIVER_ACTIVE_MANAGERS.get(); + + // histograms + [ + &READ_NUM_FS_LAYERS, + &RECONSTRUCT_TIME, + &WAIT_LSN_TIME, + &WAL_REDO_TIME, + &WAL_REDO_WAIT_TIME, + &WAL_REDO_RECORDS_HISTOGRAM, + &WAL_REDO_BYTES_HISTOGRAM, + ] + .into_iter() + .for_each(|h| { + Lazy::force(h); + }); } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0b1183fc50..1b287ee10f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -18,7 +18,6 @@ use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use storage_broker::BrokerClientChannel; use tokio::sync::watch; -use tokio::sync::OwnedMutexGuard; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; @@ -29,7 +28,6 @@ use std::cmp::min; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; -use std::ffi::OsStr; use std::fs; use std::fs::File; use std::fs::OpenOptions; @@ -48,6 +46,7 @@ use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; use self::config::TenantConf; +use self::metadata::LoadMetadataError; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; use self::timeline::uninit::TimelineUninitMark; @@ -65,12 +64,12 @@ use crate::tenant::config::TenantConfOpt; use crate::tenant::metadata::load_metadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; -use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; use crate::InitializationOrder; +use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; use crate::walredo::PostgresRedoManager; @@ -266,6 +265,14 @@ pub enum GetTimelineError { }, } +#[derive(Debug, thiserror::Error)] +pub enum LoadLocalTimelineError { + #[error("FailedToLoad")] + Load(#[source] anyhow::Error), + #[error("FailedToResumeDeletion")] + ResumeDeletion(#[source] anyhow::Error), +} + #[derive(Debug, thiserror::Error)] pub enum DeleteTimelineError { #[error("NotFound")] @@ -319,14 +326,6 @@ impl std::fmt::Display for WaitToBecomeActiveError { } } -struct DeletionGuard(OwnedMutexGuard); - -impl DeletionGuard { - fn is_deleted(&self) -> bool { - *self.0 - } -} - #[derive(thiserror::Error, Debug)] pub enum CreateTimelineError { #[error("a timeline with the given ID already exists")] @@ -337,6 +336,16 @@ pub enum CreateTimelineError { Other(#[from] anyhow::Error), } +struct TenantDirectoryScan { + sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>, + timelines_to_resume_deletion: Vec<(TimelineId, Option)>, +} + +enum CreateTimelineCause { + Load, + Delete, +} + impl Tenant { /// Yet another helper for timeline initialization. /// Contains the common part of `load_local_timeline` and `load_remote_timeline`. 
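The preinitialize_metrics() rewrite above groups the global collectors into arrays and calls Lazy::force on each, so every metric is registered with the exporter (and reported as 0) before anything increments it. A minimal standalone sketch of that pattern, assuming only the once_cell and prometheus crates already used by this file:

use once_cell::sync::Lazy;
use prometheus::{register_int_counter, IntCounter};

static EXAMPLE_DOWNLOADS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!("example_downloads_total", "Example counter")
        .expect("failed to define a metric")
});

pub fn preinitialize_example_metrics() {
    // Forcing the Lazy runs the register_* macro immediately, so the exporter
    // reports the series with value 0 instead of omitting it until the first
    // inc(); alert rules can then tell "metric absent" from "metric is zero".
    Lazy::force(&EXAMPLE_DOWNLOADS);
}

Label-vector metrics can only be forced, not fully pre-populated, because their label sets are not known up front; that is why the countervecs group above forces the vec itself and nothing more.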
@@ -375,6 +384,7 @@ impl Tenant { ancestor.clone(), remote_client, init_order, + CreateTimelineCause::Load, )?; let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -803,11 +813,14 @@ impl Tenant { tenant } - pub fn scan_and_sort_timelines_dir( - self: Arc, - ) -> anyhow::Result> { - let timelines_dir = self.conf.timelines_path(&self.tenant_id); + fn scan_and_sort_timelines_dir(self: Arc) -> anyhow::Result { let mut timelines_to_load: HashMap = HashMap::new(); + // Note timelines_to_resume_deletion needs to be separate because it can be not sortable + // from the point of `tree_sort_timelines`. I e some parents can be missing because deletion + // completed in non topological order (for example because parent has smaller number of layer files in it) + let mut timelines_to_resume_deletion: Vec<(TimelineId, Option)> = vec![]; + + let timelines_dir = self.conf.timelines_path(&self.tenant_id); for entry in std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")? @@ -835,16 +848,13 @@ impl Tenant { ); continue; } + let timeline_uninit_mark_file = &timeline_dir; info!( "Found an uninit mark file {}, removing the timeline and its uninit mark", timeline_uninit_mark_file.display() ); - let timeline_id = timeline_uninit_mark_file - .file_stem() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() + let timeline_id = TimelineId::try_from(timeline_uninit_mark_file.file_stem()) .with_context(|| { format!( "Could not parse timeline id out of the timeline uninit mark name {}", @@ -857,6 +867,47 @@ impl Tenant { { error!("Failed to clean up uninit marked timeline: {e:?}"); } + } else if crate::is_delete_mark(&timeline_dir) { + // If metadata exists, load as usual, continue deletion + let timeline_id = + TimelineId::try_from(timeline_dir.file_stem()).with_context(|| { + format!( + "Could not parse timeline id out of the timeline uninit mark name {}", + timeline_dir.display() + ) + })?; + + match load_metadata(self.conf, &self.tenant_id, &timeline_id) { + Ok(metadata) => { + timelines_to_resume_deletion.push((timeline_id, Some(metadata))) + } + Err(e) => match &e { + LoadMetadataError::Read(r) => { + if r.kind() != io::ErrorKind::NotFound { + return Err(anyhow::anyhow!(e)).with_context(|| { + format!("Failed to load metadata for timeline_id {timeline_id}") + }); + } + + // If metadata doesnt exist it means that we've crashed without + // completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow. + // So save timeline_id for later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`. + // We cant do it here because the method is async so we'd need block_on + // and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations + // so that basically results in a cycle: + // spawn_blocking + // - block_on + // - spawn_blocking + // which can lead to running out of threads in blocing pool. 
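The spawn_blocking → block_on → spawn_blocking cycle described in the comment above is why scan_and_sort_timelines_dir only records the affected timeline IDs and leaves the async cleanup to its caller. A rough sketch of the shape this avoids, with simplified names rather than the pageserver API:

async fn load_timelines() -> anyhow::Result<()> {
    // The blocking scan stays purely synchronous and just reports what it found.
    // Calling Handle::block_on(cleanup(..)) inside it could occupy blocking-pool
    // threads that the nested async work itself needs.
    let to_cleanup: Vec<u64> = tokio::task::spawn_blocking(|| {
        // ... walk the timelines directory ...
        vec![42]
    })
    .await?;

    // Async cleanup runs afterwards, back on the normal runtime.
    for timeline_id in to_cleanup {
        cleanup_remaining_traces(timeline_id).await?;
    }
    Ok(())
}

async fn cleanup_remaining_traces(_timeline_id: u64) -> anyhow::Result<()> {
    Ok(())
}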
+ timelines_to_resume_deletion.push((timeline_id, None)); + } + _ => { + return Err(anyhow::anyhow!(e)).with_context(|| { + format!("Failed to load metadata for timeline_id {timeline_id}") + }) + } + }, + } } else { if !timeline_dir.exists() { warn!( @@ -865,12 +916,8 @@ impl Tenant { ); continue; } - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { + let timeline_id = + TimelineId::try_from(timeline_dir.file_name()).with_context(|| { format!( "Could not parse timeline id out of the timeline dir name {}", timeline_dir.display() @@ -892,6 +939,14 @@ impl Tenant { continue; } + let timeline_delete_mark_file = self + .conf + .timeline_delete_mark_file_path(self.tenant_id, timeline_id); + if timeline_delete_mark_file.exists() { + // Cleanup should be done in `is_delete_mark` branch above + continue; + } + let file_name = entry.file_name(); if let Ok(timeline_id) = file_name.to_str().unwrap_or_default().parse::() @@ -911,7 +966,10 @@ impl Tenant { // Sort the array of timeline IDs into tree-order, so that parent comes before // all its children. - tree_sort_timelines(timelines_to_load) + tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan { + sorted_timelines_to_load: sorted_timelines, + timelines_to_resume_deletion, + }) } /// @@ -937,7 +995,7 @@ impl Tenant { let span = info_span!("blocking"); let cloned = Arc::clone(self); - let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || { + let scan = tokio::task::spawn_blocking(move || { let _g = span.entered(); cloned.scan_and_sort_timelines_dir() }) @@ -948,10 +1006,60 @@ impl Tenant { // FIXME original collect_timeline_files contained one more check: // 1. "Timeline has no ancestor and no layer files" - for (timeline_id, local_metadata) in sorted_timelines { - self.load_local_timeline(timeline_id, local_metadata, init_order, ctx) + // Process loadable timelines first + for (timeline_id, local_metadata) in scan.sorted_timelines_to_load { + if let Err(e) = self + .load_local_timeline(timeline_id, local_metadata, init_order, ctx, false) .await - .with_context(|| format!("load local timeline {timeline_id}"))?; + { + match e { + LoadLocalTimelineError::Load(source) => { + return Err(anyhow::anyhow!(source) + .context("Failed to load local timeline: {timeline_id}")) + } + LoadLocalTimelineError::ResumeDeletion(source) => { + // Make sure resumed deletion wont fail loading for entire tenant. + error!("Failed to resume timeline deletion: {source:#}") + } + } + } + } + + // Resume deletion ones with deleted_mark + for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion { + match maybe_local_metadata { + None => { + // See comment in `scan_and_sort_timelines_dir`. + if let Err(e) = + DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id) + .await + { + warn!( + "cannot clean up deleted timeline dir timeline_id: {} error: {:#}", + timeline_id, e + ); + } + } + Some(local_metadata) => { + if let Err(e) = self + .load_local_timeline(timeline_id, local_metadata, init_order, ctx, true) + .await + { + match e { + LoadLocalTimelineError::Load(source) => { + // We tried to load deleted timeline, this is a bug. + return Err(anyhow::anyhow!(source).context( + "This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}" + )); + } + LoadLocalTimelineError::ResumeDeletion(source) => { + // Make sure resumed deletion wont fail loading for entire tenant. 
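tree_sort_timelines, used above to produce sorted_timelines_to_load, is only relied on here for its contract: every ancestor appears before its descendants. The delete-marked timelines are kept out of it precisely because their ancestors may already be gone. A hypothetical illustration of that ordering (not the real implementation):

use std::collections::HashMap;

fn tree_sort(parents: HashMap<u32, Option<u32>>) -> Vec<u32> {
    let mut children: HashMap<Option<u32>, Vec<u32>> = HashMap::new();
    for (id, parent) in parents {
        children.entry(parent).or_default().push(id);
    }
    let mut out = Vec::new();
    let mut stack = children.remove(&None).unwrap_or_default(); // roots first
    while let Some(id) = stack.pop() {
        out.push(id);
        if let Some(kids) = children.remove(&Some(id)) {
            stack.extend(kids); // a child is emitted only after its parent
        }
    }
    out
}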
+ error!("Failed to resume timeline deletion: {source:#}") + } + } + } + } + } } trace!("Done"); @@ -969,7 +1077,8 @@ impl Tenant { local_metadata: TimelineMetadata, init_order: Option<&InitializationOrder>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + found_delete_mark: bool, + ) -> Result<(), LoadLocalTimelineError> { span::debug_assert_current_span_has_tenant_id(); let remote_client = self.remote_storage.as_ref().map(|remote_storage| { @@ -981,14 +1090,6 @@ impl Tenant { ) }); - let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { - let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) - .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?; - Some(ancestor_timeline) - } else { - None - }; - let (remote_startup_data, remote_client) = match remote_client { Some(remote_client) => match remote_client.download_index_file().await { Ok(index_part) => { @@ -1008,45 +1109,29 @@ impl Tenant { info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler"); remote_client - .init_upload_queue_stopped_to_continue_deletion(&index_part)?; + .init_upload_queue_stopped_to_continue_deletion(&index_part) + .context("init queue stopped") + .map_err(LoadLocalTimelineError::ResumeDeletion)?; - let timeline = self - .create_timeline_struct( - timeline_id, - &local_metadata, - ancestor, - Some(remote_client), - init_order, - ) - .context("create_timeline_struct")?; - - let guard = DeletionGuard( - Arc::clone(&timeline.delete_lock) - .try_lock_owned() - .expect("cannot happen because we're the only owner"), - ); - - // Note: here we even skip populating layer map. Timeline is essentially uninitialized. - // RemoteTimelineClient is the only functioning part. - timeline.set_state(TimelineState::Stopping); - // We meed to do this because when console retries delete request we shouldnt answer with 404 - // because 404 means successful deletion. - // FIXME consider TimelineState::Deleting. - let mut locked = self.timelines.lock().unwrap(); - locked.insert(timeline_id, Arc::clone(&timeline)); - - Tenant::schedule_delete_timeline( + DeleteTimelineFlow::resume_deletion( Arc::clone(self), timeline_id, - timeline, - guard, - ); + &local_metadata, + Some(remote_client), + init_order, + ) + .await + .context("resume deletion") + .map_err(LoadLocalTimelineError::ResumeDeletion)?; return Ok(()); } }; - let remote_metadata = index_part.parse_metadata().context("parse_metadata")?; + let remote_metadata = index_part + .parse_metadata() + .context("parse_metadata") + .map_err(LoadLocalTimelineError::Load)?; ( Some(RemoteStartupData { index_part, @@ -1056,12 +1141,54 @@ impl Tenant { ) } Err(DownloadError::NotFound) => { - info!("no index file was found on the remote"); + info!("no index file was found on the remote, found_delete_mark: {found_delete_mark}"); + + if found_delete_mark { + // We could've resumed at a point where remote index was deleted, but metadata file wasnt. + // Cleanup: + return DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces( + self, + timeline_id, + ) + .await + .context("cleanup_remaining_timeline_fs_traces") + .map_err(LoadLocalTimelineError::ResumeDeletion); + } + + // We're loading fresh timeline that didnt yet make it into remote. 
(None, Some(remote_client)) } - Err(e) => return Err(anyhow::anyhow!(e)), + Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))), }, - None => (None, remote_client), + None => { + // No remote client + if found_delete_mark { + // There is no remote client, we found local metadata. + // Continue cleaning up local disk. + DeleteTimelineFlow::resume_deletion( + Arc::clone(self), + timeline_id, + &local_metadata, + None, + init_order, + ) + .await + .context("resume deletion") + .map_err(LoadLocalTimelineError::ResumeDeletion)?; + return Ok(()); + } + + (None, remote_client) + } + }; + + let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { + let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) + .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}")) + .map_err(LoadLocalTimelineError::Load)?; + Some(ancestor_timeline) + } else { + None }; self.timeline_init_and_sync( @@ -1075,6 +1202,7 @@ impl Tenant { ctx, ) .await + .map_err(LoadLocalTimelineError::Load) } pub fn tenant_id(&self) -> TenantId { @@ -1435,269 +1563,6 @@ impl Tenant { } } - /// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its - /// data from both disk and s3. - async fn delete_timeline( - &self, - timeline_id: TimelineId, - timeline: Arc, - guard: DeletionGuard, - ) -> anyhow::Result<()> { - { - // Grab the layer_removal_cs lock, and actually perform the deletion. - // - // This lock prevents prevents GC or compaction from running at the same time. - // The GC task doesn't register itself with the timeline it's operating on, - // so it might still be running even though we called `shutdown_tasks`. - // - // Note that there are still other race conditions between - // GC, compaction and timeline deletion. See - // https://github.com/neondatabase/neon/issues/2671 - // - // No timeout here, GC & Compaction should be responsive to the - // `TimelineState::Stopping` change. - info!("waiting for layer_removal_cs.lock()"); - let layer_removal_guard = timeline.layer_removal_cs.lock().await; - info!("got layer_removal_cs.lock(), deleting layer files"); - - // NB: remote_timeline_client upload tasks that reference these layers have been cancelled - // by the caller. - - let local_timeline_directory = self - .conf - .timeline_path(&self.tenant_id, &timeline.timeline_id); - - fail::fail_point!("timeline-delete-before-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? - }); - - // NB: This need not be atomic because the deleted flag in the IndexPart - // will be observed during tenant/timeline load. The deletion will be resumed there. - // - // For configurations without remote storage, we tolerate that we're not crash-safe here. - // The timeline may come up Active but with missing layer files, in such setups. - // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720 - match std::fs::remove_dir_all(&local_timeline_directory) { - Err(e) if e.kind() == std::io::ErrorKind::NotFound => { - // This can happen if we're called a second time, e.g., - // because of a previous failure/cancellation at/after - // failpoint timeline-delete-after-rm. - // - // It can also happen if we race with tenant detach, because, - // it doesn't grab the layer_removal_cs lock. - // - // For now, log and continue. - // warn! level is technically not appropriate for the - // first case because we should expect retries to happen. 
- // But the error is so rare, it seems better to get attention if it happens. - let tenant_state = self.current_state(); - warn!( - timeline_dir=?local_timeline_directory, - ?tenant_state, - "timeline directory not found, proceeding anyway" - ); - // continue with the rest of the deletion - } - res => res.with_context(|| { - format!( - "Failed to remove local timeline directory '{}'", - local_timeline_directory.display() - ) - })?, - } - - info!("finished deleting layer files, releasing layer_removal_cs.lock()"); - drop(layer_removal_guard); - } - - fail::fail_point!("timeline-delete-after-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? - }); - - if let Some(remote_client) = &timeline.remote_client { - remote_client.delete_all().await.context("delete_all")? - }; - - pausable_failpoint!("in_progress_delete"); - - { - // Remove the timeline from the map. - let mut timelines = self.timelines.lock().unwrap(); - let children_exist = timelines - .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); - // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. - // We already deleted the layer files, so it's probably best to panic. - // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) - if children_exist { - panic!("Timeline grew children while we removed layer files"); - } - - timelines.remove(&timeline_id).expect( - "timeline that we were deleting was concurrently removed from 'timelines' map", - ); - - drop(timelines); - } - - drop(guard); - - Ok(()) - } - - /// Removes timeline-related in-memory data and schedules removal from remote storage. - #[instrument(skip(self, _ctx))] - pub async fn prepare_and_schedule_delete_timeline( - self: Arc, - timeline_id: TimelineId, - _ctx: &RequestContext, - ) -> Result<(), DeleteTimelineError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - - // Transition the timeline into TimelineState::Stopping. - // This should prevent new operations from starting. - // - // Also grab the Timeline's delete_lock to prevent another deletion from starting. - let timeline; - let delete_lock_guard; - { - let mut timelines = self.timelines.lock().unwrap(); - - // Ensure that there are no child timelines **attached to that pageserver**, - // because detach removes files, which will break child branches - let children: Vec = timelines - .iter() - .filter_map(|(id, entry)| { - if entry.get_ancestor_timeline_id() == Some(timeline_id) { - Some(*id) - } else { - None - } - }) - .collect(); - - if !children.is_empty() { - return Err(DeleteTimelineError::HasChildren(children)); - } - - let timeline_entry = match timelines.entry(timeline_id) { - Entry::Occupied(e) => e, - Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound), - }; - - timeline = Arc::clone(timeline_entry.get()); - - // Prevent two tasks from trying to delete the timeline at the same time. - delete_lock_guard = DeletionGuard( - Arc::clone(&timeline.delete_lock) - .try_lock_owned() - .map_err(|_| DeleteTimelineError::AlreadyInProgress)?, - ); - - // If another task finished the deletion just before we acquired the lock, - // return success. - if delete_lock_guard.is_deleted() { - return Ok(()); - } - - timeline.set_state(TimelineState::Stopping); - - drop(timelines); - } - - // Now that the Timeline is in Stopping state, request all the related tasks to - // shut down. 
- // - // NB: If this fails half-way through, and is retried, the retry will go through - // all the same steps again. Make sure the code here is idempotent, and don't - // error out if some of the shutdown tasks have already been completed! - - // Stop the walreceiver first. - debug!("waiting for wal receiver to shutdown"); - let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; - if let Some(walreceiver) = maybe_started_walreceiver { - walreceiver.stop().await; - } - debug!("wal receiver shutdown confirmed"); - - // Prevent new uploads from starting. - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.stop(); - match res { - Ok(()) => {} - Err(e) => match e { - remote_timeline_client::StopError::QueueUninitialized => { - // This case shouldn't happen currently because the - // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. - // That is, before we declare the Tenant as Active. - // But we only allow calls to delete_timeline on Active tenants. - return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); - } - }, - } - } - - // Stop & wait for the remaining timeline tasks, including upload tasks. - // NB: This and other delete_timeline calls do not run as a task_mgr task, - // so, they are not affected by this shutdown_tasks() call. - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; - - // Mark timeline as deleted in S3 so we won't pick it up next time - // during attach or pageserver restart. - // See comment in persist_index_part_with_deleted_flag. - if let Some(remote_client) = timeline.remote_client.as_ref() { - match remote_client.persist_index_part_with_deleted_flag().await { - // If we (now, or already) marked it successfully as deleted, we can proceed - Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), - // Bail out otherwise - // - // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents - // two tasks from performing the deletion at the same time. The first task - // that starts deletion should run it to completion. 
- Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) - | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { - return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); - } - } - } - self.schedule_delete_timeline(timeline_id, timeline, delete_lock_guard); - - Ok(()) - } - - fn schedule_delete_timeline( - self: Arc, - timeline_id: TimelineId, - timeline: Arc, - guard: DeletionGuard, - ) { - let tenant_id = self.tenant_id; - let timeline_clone = Arc::clone(&timeline); - - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::TimelineDeletionWorker, - Some(self.tenant_id), - Some(timeline_id), - "timeline_delete", - false, - async move { - if let Err(err) = self.delete_timeline(timeline_id, timeline, guard).await { - error!("Error: {err:#}"); - timeline_clone.set_broken(err.to_string()) - }; - Ok(()) - } - .instrument({ - let span = - tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id); - span.follows_from(Span::current()); - span - }), - ); - } - pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } @@ -2154,6 +2019,10 @@ impl Tenant { /// The returned Timeline is in Loading state. The caller is responsible for /// initializing any on-disk state, and for inserting the Timeline to the 'timelines' /// map. + /// + /// `validate_ancestor == false` is used when a timeline is created for deletion + /// and we might not have the ancestor present anymore which is fine for to be + /// deleted timelines. fn create_timeline_struct( &self, new_timeline_id: TimelineId, @@ -2161,19 +2030,26 @@ impl Tenant { ancestor: Option>, remote_client: Option, init_order: Option<&InitializationOrder>, + cause: CreateTimelineCause, ) -> anyhow::Result> { - if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { - anyhow::ensure!( - ancestor.is_some(), - "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" - ) - } + let state = match cause { + CreateTimelineCause::Load => { + let ancestor_id = new_metadata.ancestor_timeline(); + anyhow::ensure!( + ancestor_id == ancestor.as_ref().map(|t| t.timeline_id), + "Timeline's {new_timeline_id} ancestor {ancestor_id:?} was not found" + ); + TimelineState::Loading + } + CreateTimelineCause::Delete => TimelineState::Stopping, + }; let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start); let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt); let pg_version = new_metadata.pg_version(); - Ok(Timeline::new( + + let timeline = Timeline::new( self.conf, Arc::clone(&self.tenant_conf), new_metadata, @@ -2185,7 +2061,10 @@ impl Tenant { pg_version, initial_logical_size_can_start.cloned(), initial_logical_size_attempt.cloned(), - )) + state, + ); + + Ok(timeline) } fn new( @@ -2852,7 +2731,14 @@ impl Tenant { }; let timeline_struct = self - .create_timeline_struct(new_timeline_id, new_metadata, ancestor, remote_client, None) + .create_timeline_struct( + new_timeline_id, + new_metadata, + ancestor, + remote_client, + None, + CreateTimelineCause::Load, + ) .context("Failed to create timeline data structure")?; timeline_struct.init_empty_layer_map(start_lsn); @@ -3270,19 +3156,6 @@ pub async fn dump_layerfile_from_path( Ok(()) } -fn ignore_absent_files(fs_operation: F) -> io::Result<()> -where - F: Fn() -> io::Result<()>, -{ - fs_operation().or_else(|e| { - if e.kind() == io::ErrorKind::NotFound { - Ok(()) - } else { - Err(e) - } - }) -} - #[cfg(test)] pub mod harness { use 
bytes::{Bytes, BytesMut}; @@ -3961,9 +3834,9 @@ mod tests { .await .err() .expect("should fail"); - // get all the stack with all .context, not tonly the last one + // get all the stack with all .context, not only the last one let message = format!("{err:#}"); - let expected = "Failed to parse metadata bytes from path"; + let expected = "failed to load metadata"; assert!( message.contains(expected), "message '{message}' expected to contain {expected}" @@ -3980,7 +3853,8 @@ mod tests { } assert!( found_error_message, - "didn't find the corrupted metadata error" + "didn't find the corrupted metadata error in {}", + message ); Ok(()) diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 734409a619..518286ddb0 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -390,39 +390,42 @@ where } #[allow(dead_code)] - pub fn dump(&self) -> Result<()> { - self.dump_recurse(self.root_blk, &[], 0) - } + pub async fn dump(&self) -> Result<()> { + let mut stack = Vec::new(); - fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> { - let blk = self.reader.read_blk(self.start_blk + blknum)?; - let buf: &[u8] = blk.as_ref(); + stack.push((self.root_blk, String::new(), 0, 0, 0)); - let node = OnDiskNode::::deparse(buf)?; + while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() { + let blk = self.reader.read_blk(self.start_blk + blknum)?; + let buf: &[u8] = blk.as_ref(); + let node = OnDiskNode::::deparse(buf)?; - print!("{:indent$}", "", indent = depth * 2); - println!( - "blk #{}: path {}: prefix {}, suffix_len {}", - blknum, - hex::encode(path), - hex::encode(node.prefix), - node.suffix_len - ); + if child_idx == 0 { + print!("{:indent$}", "", indent = depth * 2); + let path_prefix = stack + .iter() + .map(|(_blknum, path, ..)| path.as_str()) + .collect::(); + println!( + "blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}", + hex::encode(node.prefix), + node.suffix_len + ); + } - let mut idx = 0; - let mut key_off = 0; - while idx < node.num_children { + if child_idx + 1 < node.num_children { + let key_off = key_off + node.suffix_len as usize; + stack.push((blknum, path.clone(), depth, child_idx + 1, key_off)); + } let key = &node.keys[key_off..key_off + node.suffix_len as usize]; - let val = node.value(idx as usize); + let val = node.value(child_idx as usize); + print!("{:indent$}", "", indent = depth * 2 + 2); println!("{}: {}", hex::encode(key), hex::encode(val.0)); if node.level > 0 { - let child_path = [path, node.prefix].concat(); - self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?; + stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0)); } - idx += 1; - key_off += node.suffix_len as usize; } Ok(()) } @@ -754,8 +757,8 @@ mod tests { } } - #[test] - fn basic() -> Result<()> { + #[tokio::test] + async fn basic() -> Result<()> { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk); @@ -775,7 +778,7 @@ mod tests { let reader = DiskBtreeReader::new(0, root_offset, disk); - reader.dump()?; + reader.dump().await?; // Test the `get` function on all the keys. 
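dump() above becomes async, so the recursive dump_recurse is flattened into a loop over an explicit stack of (block, path, depth, child index, key offset) tuples. The same recursion-to-stack transformation on a toy tree, as a sketch with made-up types:

struct Node {
    label: String,
    children: Vec<Node>,
}

fn dump(root: &Node) {
    let mut stack = vec![(root, 0usize)];
    while let Some((node, depth)) = stack.pop() {
        println!("{:indent$}{}", "", node.label, indent = depth * 2);
        // Push children in reverse so they are printed in their original order.
        for child in node.children.iter().rev() {
            stack.push((child, depth + 1));
        }
    }
}

Keeping per-frame state such as child_idx and key_off on the stack is what lets the real version resume a node after descending into one of its children.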
for (key, val) in all_data.iter() { @@ -835,8 +838,8 @@ mod tests { Ok(()) } - #[test] - fn lots_of_keys() -> Result<()> { + #[tokio::test] + async fn lots_of_keys() -> Result<()> { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk); @@ -856,7 +859,7 @@ mod tests { let reader = DiskBtreeReader::new(0, root_offset, disk); - reader.dump()?; + reader.dump().await?; use std::sync::Mutex; @@ -994,8 +997,8 @@ mod tests { /// /// This test contains a particular data set, see disk_btree_test_data.rs /// - #[test] - fn particular_data() -> Result<()> { + #[tokio::test] + async fn particular_data() -> Result<()> { // Build a tree from it let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk); @@ -1022,7 +1025,7 @@ mod tests { })?; assert_eq!(count, disk_btree_test_data::TEST_DATA.len()); - reader.dump()?; + reader.dump().await?; Ok(()) } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index d52bb66e76..a27602c335 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -9,10 +9,11 @@ //! [`remote_timeline_client`]: super::remote_timeline_client use std::fs::{File, OpenOptions}; -use std::io::Write; +use std::io::{self, Write}; use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; +use thiserror::Error; use tracing::info_span; use utils::bin_ser::SerializeError; use utils::{ @@ -267,24 +268,24 @@ pub fn save_metadata( Ok(()) } +#[derive(Error, Debug)] +pub enum LoadMetadataError { + #[error(transparent)] + Read(#[from] io::Error), + + #[error(transparent)] + Decode(#[from] anyhow::Error), +} + pub fn load_metadata( conf: &'static PageServerConf, tenant_id: &TenantId, timeline_id: &TimelineId, -) -> anyhow::Result { +) -> Result { let metadata_path = conf.metadata_path(tenant_id, timeline_id); - let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { - format!( - "Failed to read metadata bytes from path {}", - metadata_path.display() - ) - })?; - TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| { - format!( - "Failed to parse metadata bytes from path {}", - metadata_path.display() - ) - }) + let metadata_bytes = std::fs::read(metadata_path)?; + + Ok(TimelineMetadata::from_bytes(&metadata_bytes)?) } #[cfg(test)] diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index eeb84caf13..25c5e3f2e0 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -26,6 +26,8 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME}; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; +use super::timeline::delete::DeleteTimelineFlow; + /// The tenants known to the pageserver. /// The enum variants are used to distinguish the different states that the pageserver can be in. 
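load_metadata now returns a structured LoadMetadataError instead of a pre-flattened anyhow error, so the caller in scan_and_sort_timelines_dir can treat a missing metadata file (ErrorKind::NotFound) differently from a corrupt one. A small sketch of the same shape, using stand-in types and only thiserror plus std:

use thiserror::Error;

#[derive(Error, Debug)]
enum LoadError {
    #[error(transparent)]
    Read(#[from] std::io::Error),
    #[error(transparent)]
    Decode(#[from] std::string::FromUtf8Error),
}

fn load(path: &std::path::Path) -> Result<String, LoadError> {
    let bytes = std::fs::read(path)?; // io failure -> LoadError::Read
    Ok(String::from_utf8(bytes)?)     // decode failure -> LoadError::Decode
}

fn is_missing(e: &LoadError) -> bool {
    matches!(e, LoadError::Read(io) if io.kind() == std::io::ErrorKind::NotFound)
}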
enum TenantsMap { @@ -421,12 +423,10 @@ pub enum DeleteTimelineError { pub async fn delete_timeline( tenant_id: TenantId, timeline_id: TimelineId, - ctx: &RequestContext, + _ctx: &RequestContext, ) -> Result<(), DeleteTimelineError> { let tenant = get_tenant(tenant_id, true).await?; - tenant - .prepare_and_schedule_delete_timeline(timeline_id, ctx) - .await?; + DeleteTimelineFlow::run(&tenant, timeline_id).await?; Ok(()) } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1355356712..8d002a8570 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -514,7 +514,7 @@ impl RemoteTimelineClient { /// updated metadata. /// /// The upload will be added to the queue immediately, but it - /// won't be performed until all previosuly scheduled layer file + /// won't be performed until all previously scheduled layer file /// upload operations have completed successfully. This is to /// ensure that when the index file claims that layers X, Y and Z /// exist in remote storage, they really do. To wait for the upload @@ -625,7 +625,7 @@ impl RemoteTimelineClient { /// Note: This schedules an index file upload before the deletions. The /// deletion won't actually be performed, until any previously scheduled /// upload operations, and the index file upload, have completed - /// succesfully. + /// successfully. pub fn schedule_layer_file_deletion( self: &Arc, names: &[LayerFileName], @@ -827,7 +827,7 @@ impl RemoteTimelineClient { ) }; - receiver.changed().await?; + receiver.changed().await.context("upload queue shut down")?; // Do not delete index part yet, it is needed for possible retry. If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage @@ -855,11 +855,23 @@ impl RemoteTimelineClient { self.storage_impl.delete_objects(&remaining).await?; } + fail::fail_point!("timeline-delete-before-index-delete", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-index-delete" + ))? + }); + let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME)); debug!("deleting index part"); self.storage_impl.delete(&index_file_path).await?; + fail::fail_point!("timeline-delete-after-index-delete", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-after-index-delete" + ))? + }); + info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json"); Ok(()) @@ -1105,7 +1117,7 @@ impl RemoteTimelineClient { debug!("remote task {} completed successfully", task.op); } - // The task has completed succesfully. Remove it from the in-progress list. + // The task has completed successfully. Remove it from the in-progress list. 
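The two failpoints added around index_part deletion let the deletion tests stop the flow at an exact step and then exercise the retry path. A hedged sketch of how such a failpoint can be driven with the fail crate; in the pageserver they are configured through the management API rather than in-process like this:

fn delete_index_part() -> anyhow::Result<()> {
    fail::fail_point!("timeline-delete-before-index-delete", |_| {
        Err(anyhow::anyhow!(
            "failpoint: timeline-delete-before-index-delete"
        ))?
    });
    // ... the actual deletion would happen here ...
    Ok(())
}

#[cfg(test)]
mod tests {
    #[test]
    fn failpoint_makes_deletion_fail() {
        // Requires building with the failpoints feature enabled.
        fail::cfg("timeline-delete-before-index-delete", "return").unwrap();
        assert!(super::delete_index_part().is_err());
        fail::remove("timeline-delete-before-index-delete");
    }
}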
{ let mut upload_queue_guard = self.upload_queue.lock().unwrap(); let upload_queue = match upload_queue_guard.deref_mut() { diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index c3f6dcadec..fdbf26e6ae 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -223,6 +223,45 @@ mod tests { assert_eq!(part, expected); } + #[test] + fn v2_indexpart_is_parsed_with_deleted_at() { + let example = r#"{ + "version":2, + "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], + "missing_layers":["This shouldn't fail deserialization"], + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "deleted_at": "2023-07-31T09:00:00.123" + }"#; + + let expected = IndexPart { + // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? + version: 2, + timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + file_size: 25600000, + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. 
+ file_size: 9007199254741001, + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + deleted_at: Some(chrono::NaiveDateTime::parse_from_str( + "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()) + }; + + let part = serde_json::from_str::(example).unwrap(); + assert_eq!(part, expected); + } + #[test] fn empty_layers_are_parsed() { let empty_layers_json = r#"{ diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 9585c04120..7574554b4e 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -256,7 +256,7 @@ impl Layer for DeltaLayer { file, ); - tree_reader.dump()?; + tree_reader.dump().await?; let mut cursor = file.block_cursor(); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 53cff824e3..a5b9d8834e 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -38,6 +38,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use hex; +use once_cell::sync::OnceCell; use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -47,7 +48,6 @@ use std::io::{Seek, SeekFrom}; use std::ops::Range; use std::os::unix::prelude::FileExt; use std::path::{Path, PathBuf}; -use std::sync::{RwLock, RwLockReadGuard}; use tracing::*; use utils::{ @@ -117,7 +117,7 @@ pub struct ImageLayer { access_stats: LayerAccessStats, - inner: RwLock, + inner: OnceCell, } impl std::fmt::Debug for ImageLayer { @@ -134,21 +134,17 @@ impl std::fmt::Debug for ImageLayer { } pub struct ImageLayerInner { - /// If false, the 'index' has not been loaded into memory yet. - loaded: bool, - // values copied from summary index_start_blk: u32, index_root_blk: u32, - /// Reader object for reading blocks from the file. (None if not loaded yet) - file: Option>, + /// Reader object for reading blocks from the file. 
+ file: FileBlockReader, } impl std::fmt::Debug for ImageLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("ImageLayerInner") - .field("loaded", &self.loaded) .field("index_start_blk", &self.index_start_blk) .field("index_root_blk", &self.index_root_blk) .finish() @@ -175,11 +171,11 @@ impl Layer for ImageLayer { } let inner = self.load(LayerAccessKind::Dump, ctx)?; - let file = inner.file.as_ref().unwrap(); + let file = &inner.file; let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file); - tree_reader.dump()?; + tree_reader.dump().await?; tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| { println!("key: {} offset {}", hex::encode(key), value); @@ -203,7 +199,7 @@ impl Layer for ImageLayer { let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?; - let file = inner.file.as_ref().unwrap(); + let file = &inner.file; let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file); let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; @@ -322,52 +318,26 @@ impl ImageLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. /// - fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result> { + fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> { self.access_stats .record_access(access_kind, ctx.task_kind()); loop { - // Quick exit if already loaded - let inner = self.inner.read().unwrap(); - if inner.loaded { + if let Some(inner) = self.inner.get() { return Ok(inner); } - - // Need to open the file and load the metadata. Upgrade our lock to - // a write lock. (Or rather, release and re-lock in write mode.) - drop(inner); - let mut inner = self.inner.write().unwrap(); - if !inner.loaded { - self.load_inner(&mut inner).with_context(|| { - format!("Failed to load image layer {}", self.path().display()) - })? - } else { - // Another thread loaded it while we were not holding the lock. - } - - // We now have the file open and loaded. There's no function to do - // that in the std library RwLock, so we have to release and re-lock - // in read mode. (To be precise, the lock guard was moved in the - // above call to `load_inner`, so it's already been released). And - // while we do that, another thread could unload again, so we have - // to re-check and retry if that happens. - drop(inner); + self.inner + .get_or_try_init(|| self.load_inner()) + .with_context(|| format!("Failed to load image layer {}", self.path().display()))?; } } - fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> { + fn load_inner(&self) -> Result { let path = self.path(); // Open the file if it's not open already. 
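The RwLock plus `loaded` flag double-checked locking in ImageLayer::load is replaced above by once_cell's OnceCell: get_or_try_init gives the same at-most-once, fallible initialization without the lock juggling. A minimal sketch of the pattern, assuming once_cell and anyhow as already used here:

use once_cell::sync::OnceCell;

struct LazyLayer {
    inner: OnceCell<String>, // stands in for ImageLayerInner
}

impl LazyLayer {
    fn load(&self) -> anyhow::Result<&String> {
        // The closure runs at most once per value; concurrent callers either
        // perform the initialization or observe the winner's result.
        self.inner
            .get_or_try_init(|| Ok::<_, anyhow::Error>("expensive metadata".to_owned()))
    }
}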
- if inner.file.is_none() { - let file = VirtualFile::open(&path) - .with_context(|| format!("Failed to open file '{}'", path.display()))?; - inner.file = Some(FileBlockReader::new(file)); - } - let file = inner.file.as_mut().unwrap(); + let file = VirtualFile::open(&path) + .with_context(|| format!("Failed to open file '{}'", path.display()))?; + let file = FileBlockReader::new(file); let summary_blk = file.read_blk(0)?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; @@ -395,10 +365,11 @@ impl ImageLayer { } } - inner.index_start_blk = actual_summary.index_start_blk; - inner.index_root_blk = actual_summary.index_root_blk; - inner.loaded = true; - Ok(()) + Ok(ImageLayerInner { + index_start_blk: actual_summary.index_start_blk, + index_root_blk: actual_summary.index_root_blk, + file, + }) } /// Create an ImageLayer struct representing an existing file on disk @@ -422,12 +393,7 @@ impl ImageLayer { ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: filename.lsn, access_stats, - inner: RwLock::new(ImageLayerInner { - loaded: false, - file: None, - index_start_blk: 0, - index_root_blk: 0, - }), + inner: OnceCell::new(), } } @@ -454,12 +420,7 @@ impl ImageLayer { ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: summary.lsn, access_stats: LayerAccessStats::empty_will_record_residence_event_later(), - inner: RwLock::new(ImageLayerInner { - file: None, - loaded: false, - index_start_blk: 0, - index_root_blk: 0, - }), + inner: OnceCell::new(), }) } @@ -620,12 +581,7 @@ impl ImageLayerWriterInner { desc, lsn: self.lsn, access_stats: LayerAccessStats::empty_will_record_residence_event_later(), - inner: RwLock::new(ImageLayerInner { - loaded: false, - file: None, - index_start_blk, - index_root_blk, - }), + inner: OnceCell::new(), }; // fsync the file diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c663a4f9ad..34211fb714 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,3 +1,4 @@ +pub mod delete; mod eviction_task; pub mod layer_manager; mod logical_size; @@ -79,6 +80,7 @@ use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; +use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; use self::eviction_task::EvictionTaskTimelineState; use self::layer_manager::LayerManager; @@ -237,11 +239,10 @@ pub struct Timeline { /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. - /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], - /// and [`Tenant::delete_timeline`]. This is an `Arc` lock because we need an owned + /// This lock is acquired in [`Timeline::gc`] and [`Timeline::compact`]. + /// This is an `Arc` lock because we need an owned /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`). - /// - /// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline + /// Note that [`DeleteTimelineFlow`] uses `delete_progress` field. pub(super) layer_removal_cs: Arc>, // Needed to ensure that we can't create a branch at a point that was already garbage collected @@ -283,7 +284,7 @@ pub struct Timeline { /// Prevent two tasks from deleting the timeline at the same time. If held, the /// timeline is being deleted. If 'true', the timeline has already been deleted. 
- pub delete_lock: Arc>, + pub delete_progress: Arc>, eviction_task_timeline_state: tokio::sync::Mutex, @@ -293,6 +294,10 @@ pub struct Timeline { /// Completion shared between all timelines loaded during startup; used to delay heavier /// background tasks until some logical sizes have been calculated. initial_logical_size_attempt: Mutex>, + + /// Load or creation time information about the disk_consistent_lsn and when the loading + /// happened. Used for consumption metrics. + pub(crate) loaded_at: (Lsn, SystemTime), } pub struct WalReceiverInfo { @@ -1360,9 +1365,10 @@ impl Timeline { pg_version: u32, initial_logical_size_can_start: Option, initial_logical_size_attempt: Option, + state: TimelineState, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); - let (state, _) = watch::channel(TimelineState::Loading); + let (state, _) = watch::channel(state); let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); @@ -1402,6 +1408,8 @@ impl Timeline { last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), last_freeze_ts: RwLock::new(Instant::now()), + loaded_at: (disk_consistent_lsn, SystemTime::now()), + ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), @@ -1453,7 +1461,7 @@ impl Timeline { eviction_task_timeline_state: tokio::sync::Mutex::new( EvictionTaskTimelineState::default(), ), - delete_lock: Arc::new(tokio::sync::Mutex::new(false)), + delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())), initial_logical_size_can_start, initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt), @@ -1598,7 +1606,7 @@ impl Timeline { if let Some(imgfilename) = ImageFileName::parse_str(&fname) { // create an ImageLayer struct for each image file. if imgfilename.lsn > disk_consistent_lsn { - warn!( + info!( "found future image layer {} on timeline {} disk_consistent_lsn is {}", imgfilename, self.timeline_id, disk_consistent_lsn ); @@ -1630,7 +1638,7 @@ impl Timeline { // is 102, then it might not have been fully flushed to disk // before crash. if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { - warn!( + info!( "found future delta layer {} on timeline {} disk_consistent_lsn is {}", deltafilename, self.timeline_id, disk_consistent_lsn ); @@ -1772,7 +1780,7 @@ impl Timeline { match remote_layer_name { LayerFileName::Image(imgfilename) => { if imgfilename.lsn > up_to_date_disk_consistent_lsn { - warn!( + info!( "found future image layer {} on timeline {} remote_consistent_lsn is {}", imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn ); @@ -1797,7 +1805,7 @@ impl Timeline { // is 102, then it might not have been fully flushed to disk // before crash. if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 { - warn!( + info!( "found future delta layer {} on timeline {} remote_consistent_lsn is {}", deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn ); @@ -1918,6 +1926,15 @@ impl Timeline { } fn try_spawn_size_init_task(self: &Arc, lsn: Lsn, ctx: &RequestContext) { + let state = self.current_state(); + if matches!( + state, + TimelineState::Broken { .. } | TimelineState::Stopping + ) { + // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken). 
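delete_progress above keeps the guarantee the old delete_lock provided: acquiring the tokio mutex as an owned guard means only one deletion can be in flight, and the guard can move into a spawned background task. A simplified sketch of that try_lock_owned usage, with made-up names rather than the real DeleteTimelineFlow:

use std::sync::Arc;

#[derive(Default)]
struct DeleteProgress {
    finished: bool,
}

async fn try_start_deletion(
    progress: Arc<tokio::sync::Mutex<DeleteProgress>>,
) -> Result<(), &'static str> {
    // Fails immediately if another task already holds the guard.
    let guard = Arc::clone(&progress)
        .try_lock_owned()
        .map_err(|_| "deletion already in progress")?;

    if guard.finished {
        // A previous attempt completed; retrying is a no-op success.
        return Ok(());
    }

    tokio::spawn(async move {
        // The OwnedMutexGuard satisfies the 'static bound and keeps the
        // deletion exclusive until this task drops it.
        let _guard = guard;
        // ... perform the deletion steps here ...
    });

    Ok(())
}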
+ return; + } + let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) .try_acquire_owned() { diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs new file mode 100644 index 0000000000..d99ca2d292 --- /dev/null +++ b/pageserver/src/tenant/timeline/delete.rs @@ -0,0 +1,576 @@ +use std::{ + ops::{Deref, DerefMut}, + sync::Arc, +}; + +use anyhow::Context; +use pageserver_api::models::TimelineState; +use tokio::sync::OwnedMutexGuard; +use tracing::{debug, error, info, instrument, warn, Instrument, Span}; +use utils::{ + crashsafe, fs_ext, + id::{TenantId, TimelineId}, +}; + +use crate::{ + config::PageServerConf, + task_mgr::{self, TaskKind}, + tenant::{ + metadata::TimelineMetadata, + remote_timeline_client::{ + self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, + }, + CreateTimelineCause, DeleteTimelineError, Tenant, + }, + InitializationOrder, +}; + +use super::Timeline; + +/// Now that the Timeline is in Stopping state, request all the related tasks to shut down. +async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { + // Stop the walreceiver first. + debug!("waiting for wal receiver to shutdown"); + let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; + if let Some(walreceiver) = maybe_started_walreceiver { + walreceiver.stop().await; + } + debug!("wal receiver shutdown confirmed"); + + // Prevent new uploads from starting. + if let Some(remote_client) = timeline.remote_client.as_ref() { + let res = remote_client.stop(); + match res { + Ok(()) => {} + Err(e) => match e { + remote_timeline_client::StopError::QueueUninitialized => { + // This case shouldn't happen currently because the + // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. + // That is, before we declare the Tenant as Active. + // But we only allow calls to delete_timeline on Active tenants. + return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); + } + }, + } + } + + // Stop & wait for the remaining timeline tasks, including upload tasks. + // NB: This and other delete_timeline calls do not run as a task_mgr task, + // so, they are not affected by this shutdown_tasks() call. + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await; + + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-index-deleted-at" + ))? + }); + Ok(()) +} + +/// Mark timeline as deleted in S3 so we won't pick it up next time +/// during attach or pageserver restart. +/// See comment in persist_index_part_with_deleted_flag. +async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> { + if let Some(remote_client) = timeline.remote_client.as_ref() { + match remote_client.persist_index_part_with_deleted_flag().await { + // If we (now, or already) marked it successfully as deleted, we can proceed + Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), + // Bail out otherwise + // + // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents + // two tasks from performing the deletion at the same time. The first task + // that starts deletion should run it to completion. 
+ Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) + | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { + return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); + } + } + } + Ok(()) +} + +// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued. +// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json +// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote" +// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file. +// After index part is deleted presence of this mark file indentifies that it was a deletion intention. +// So we can just remove the mark file. +async fn create_delete_mark( + conf: &PageServerConf, + tenant_id: TenantId, + timeline_id: TimelineId, +) -> Result<(), DeleteTimelineError> { + fail::fail_point!("timeline-delete-before-delete-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-delete-mark" + ))? + }); + let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id); + + // Note: we're ok to replace existing file. + let _ = std::fs::OpenOptions::new() + .write(true) + .create(true) + .open(&marker_path) + .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; + + crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?; + Ok(()) +} + +/// Grab the layer_removal_cs lock, and actually perform the deletion. +/// +/// This lock prevents prevents GC or compaction from running at the same time. +/// The GC task doesn't register itself with the timeline it's operating on, +/// so it might still be running even though we called `shutdown_tasks`. +/// +/// Note that there are still other race conditions between +/// GC, compaction and timeline deletion. See +/// +/// +/// No timeout here, GC & Compaction should be responsive to the +/// `TimelineState::Stopping` change. +async fn delete_local_layer_files( + conf: &PageServerConf, + tenant_id: TenantId, + timeline: &Timeline, +) -> anyhow::Result<()> { + info!("waiting for layer_removal_cs.lock()"); + let layer_removal_guard = timeline.layer_removal_cs.lock().await; + info!("got layer_removal_cs.lock(), deleting layer files"); + + // NB: storage_sync upload tasks that reference these layers have been cancelled + // by the caller. + + let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id); + + fail::fail_point!("timeline-delete-before-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? + }); + + // NB: This need not be atomic because the deleted flag in the IndexPart + // will be observed during tenant/timeline load. The deletion will be resumed there. + // + // For configurations without remote storage, we guarantee crash-safety by persising delete mark file. + // + // Note that here we do not bail out on std::io::ErrorKind::NotFound. + // This can happen if we're called a second time, e.g., + // because of a previous failure/cancellation at/after + // failpoint timeline-delete-after-rm. + // + // It can also happen if we race with tenant detach, because, + // it doesn't grab the layer_removal_cs lock. + // + // For now, log and continue. + // warn! level is technically not appropriate for the + // first case because we should expect retries to happen. 
+ // But the error is so rare, it seems better to get attention if it happens. + // + // Note that metadata removal is skipped, this is not technically needed, + // but allows to reuse timeline loading code during resumed deletion. + // (we always expect that metadata is in place when timeline is being loaded) + + #[cfg(feature = "testing")] + let mut counter = 0; + + // Timeline directory may not exist if we failed to delete mark file and request was retried. + if !local_timeline_directory.exists() { + return Ok(()); + } + + let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id); + + for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) { + #[cfg(feature = "testing")] + { + counter += 1; + if counter == 2 { + fail::fail_point!("timeline-delete-during-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))? + }); + } + } + + let entry = entry?; + if entry.path() == metadata_path { + debug!("found metadata, skipping"); + continue; + } + + if entry.path() == local_timeline_directory { + // Keeping directory because metedata file is still there + debug!("found timeline dir itself, skipping"); + continue; + } + + let metadata = match entry.metadata() { + Ok(metadata) => metadata, + Err(e) => { + if crate::is_walkdir_io_not_found(&e) { + warn!( + timeline_dir=?local_timeline_directory, + path=?entry.path().display(), + "got not found err while removing timeline dir, proceeding anyway" + ); + continue; + } + anyhow::bail!(e); + } + }; + + let r = if metadata.is_dir() { + // There shouldnt be any directories inside timeline dir as of current layout. + tokio::fs::remove_dir(entry.path()).await + } else { + tokio::fs::remove_file(entry.path()).await + }; + + if let Err(e) = r { + if e.kind() == std::io::ErrorKind::NotFound { + warn!( + timeline_dir=?local_timeline_directory, + path=?entry.path().display(), + "got not found err while removing timeline dir, proceeding anyway" + ); + continue; + } + anyhow::bail!(anyhow::anyhow!( + "Failed to remove: {}. Error: {e}", + entry.path().display() + )); + } + } + + info!("finished deleting layer files, releasing layer_removal_cs.lock()"); + drop(layer_removal_guard); + + fail::fail_point!("timeline-delete-after-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? + }); + + Ok(()) +} + +/// Removes remote layers and an index file after them. +async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> { + if let Some(remote_client) = &timeline.remote_client { + remote_client.delete_all().await.context("delete_all")? + }; + + Ok(()) +} + +// This function removs remaining traces of a timeline on disk. +// Namely: metadata file, timeline directory, delete mark. +// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir. +// delete mark should be present because it is the last step during deletion. +// (nothing can fail after its deletion) +async fn cleanup_remaining_timeline_fs_traces( + conf: &PageServerConf, + tenant_id: TenantId, + timeline_id: TimelineId, +) -> anyhow::Result<()> { + // Remove local metadata + tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id)) + .await + .or_else(fs_ext::ignore_not_found) + .context("remove metadata")?; + + fail::fail_point!("timeline-delete-after-rm-metadata", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-after-rm-metadata" + ))? 
+ }); + + // Remove timeline dir + tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id)) + .await + .or_else(fs_ext::ignore_not_found) + .context("timeline dir")?; + + fail::fail_point!("timeline-delete-after-rm-dir", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))? + }); + + // Remove delete mark + tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id)) + .await + .context("remove delete mark") +} + +/// It is important that this gets called when DeletionGuard is being held. +/// For more context see comments in [`DeleteTimelineFlow::prepare`] +async fn remove_timeline_from_tenant( + tenant: &Tenant, + timeline_id: TimelineId, + _: &DeletionGuard, // using it as a witness +) -> anyhow::Result<()> { + // Remove the timeline from the map. + let mut timelines = tenant.timelines.lock().unwrap(); + let children_exist = timelines + .iter() + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); + // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. + // We already deleted the layer files, so it's probably best to panic. + // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) + if children_exist { + panic!("Timeline grew children while we removed layer files"); + } + + timelines + .remove(&timeline_id) + .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); + + drop(timelines); + + Ok(()) +} + +/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures, +/// and deletes its data from both disk and s3. +/// The sequence of steps: +/// 1. Set deleted_at in remote index part. +/// 2. Create local mark file. +/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata) +/// 4. Delete remote layers +/// 5. Delete index part +/// 6. Delete meta, timeline directory +/// 7. Delete mark file +/// It is resumable from any step in case a crash/restart occurs. +/// There are three entrypoints to the process: +/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler. +/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present +/// and we possibly neeed to continue deletion of remote files. +/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote +/// index but still have local metadata, timeline directory and delete mark. +/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load. +#[derive(Default)] +pub enum DeleteTimelineFlow { + #[default] + NotStarted, + InProgress, + Finished, +} + +impl DeleteTimelineFlow { + // These steps are run in the context of management api request handler. + // Long running steps are continued to run in the background. + // NB: If this fails half-way through, and is retried, the retry will go through + // all the same steps again. Make sure the code here is idempotent, and don't + // error out if some of the shutdown tasks have already been completed! 
+ #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))] + pub async fn run( + tenant: &Arc, + timeline_id: TimelineId, + ) -> Result<(), DeleteTimelineError> { + let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?; + + guard.mark_in_progress()?; + + stop_tasks(&timeline).await?; + + set_deleted_in_remote_index(&timeline).await?; + + create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?; + + fail::fail_point!("timeline-delete-before-schedule", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-schedule" + ))? + }); + + Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); + + Ok(()) + } + + fn mark_in_progress(&mut self) -> anyhow::Result<()> { + match self { + Self::Finished => anyhow::bail!("Bug. Is in finished state"), + Self::InProgress { .. } => { /* We're in a retry */ } + Self::NotStarted => { /* Fresh start */ } + } + + *self = Self::InProgress; + + Ok(()) + } + + /// Shortcut to create Timeline in stopping state and spawn deletion task. + pub async fn resume_deletion( + tenant: Arc, + timeline_id: TimelineId, + local_metadata: &TimelineMetadata, + remote_client: Option, + init_order: Option<&InitializationOrder>, + ) -> anyhow::Result<()> { + // Note: here we even skip populating layer map. Timeline is essentially uninitialized. + // RemoteTimelineClient is the only functioning part. + let timeline = tenant + .create_timeline_struct( + timeline_id, + local_metadata, + None, // Ancestor is not needed for deletion. + remote_client, + init_order, + // Important. We dont pass ancestor above because it can be missing. + // Thus we need to skip the validation here. + CreateTimelineCause::Delete, + ) + .context("create_timeline_struct")?; + + let mut guard = DeletionGuard( + Arc::clone(&timeline.delete_progress) + .try_lock_owned() + .expect("cannot happen because we're the only owner"), + ); + + // We meed to do this because when console retries delete request we shouldnt answer with 404 + // because 404 means successful deletion. + { + let mut locked = tenant.timelines.lock().unwrap(); + locked.insert(timeline_id, Arc::clone(&timeline)); + } + + guard.mark_in_progress()?; + + // Note that delete mark can be missing on resume + // because we create delete mark after we set deleted_at in the index part. + create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?; + + Self::schedule_background(guard, tenant.conf, tenant, timeline); + + Ok(()) + } + + pub async fn cleanup_remaining_timeline_fs_traces( + tenant: &Tenant, + timeline_id: TimelineId, + ) -> anyhow::Result<()> { + cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await + } + + fn prepare( + tenant: &Tenant, + timeline_id: TimelineId, + ) -> Result<(Arc, DeletionGuard), DeleteTimelineError> { + // Note the interaction between this guard and deletion guard. + // Here we attempt to lock deletion guard when we're holding a lock on timelines. + // This is important because when you take into account `remove_timeline_from_tenant` + // we remove timeline from memory when we still hold the deletion guard. 
+ // So here when timeline deletion is finished timeline wont be present in timelines map at all + // which makes the following sequence impossible: + // T1: get preempted right before the try_lock on `Timeline::delete_progress` + // T2: do a full deletion, acquire and drop `Timeline::delete_progress` + // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` + // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` + let timelines = tenant.timelines.lock().unwrap(); + + let timeline = match timelines.get(&timeline_id) { + Some(t) => t, + None => return Err(DeleteTimelineError::NotFound), + }; + + // Ensure that there are no child timelines **attached to that pageserver**, + // because detach removes files, which will break child branches + let children: Vec = timelines + .iter() + .filter_map(|(id, entry)| { + if entry.get_ancestor_timeline_id() == Some(timeline_id) { + Some(*id) + } else { + None + } + }) + .collect(); + + if !children.is_empty() { + return Err(DeleteTimelineError::HasChildren(children)); + } + + // Note that using try_lock here is important to avoid a deadlock. + // Here we take lock on timelines and then the deletion guard. + // At the end of the operation we're holding the guard and need to lock timelines map + // to remove the timeline from it. + // Always if you have two locks that are taken in different order this can result in a deadlock. + let delete_lock_guard = DeletionGuard( + Arc::clone(&timeline.delete_progress) + .try_lock_owned() + .map_err(|_| DeleteTimelineError::AlreadyInProgress)?, + ); + + timeline.set_state(TimelineState::Stopping); + + Ok((Arc::clone(timeline), delete_lock_guard)) + } + + fn schedule_background( + guard: DeletionGuard, + conf: &'static PageServerConf, + tenant: Arc, + timeline: Arc, + ) { + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::TimelineDeletionWorker, + Some(tenant_id), + Some(timeline_id), + "timeline_delete", + false, + async move { + if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await { + error!("Error: {err:#}"); + timeline.set_broken(format!("{err:#}")) + }; + Ok(()) + } + .instrument({ + let span = + tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id); + span.follows_from(Span::current()); + span + }), + ); + } + + async fn background( + mut guard: DeletionGuard, + conf: &PageServerConf, + tenant: &Tenant, + timeline: &Timeline, + ) -> Result<(), DeleteTimelineError> { + delete_local_layer_files(conf, tenant.tenant_id, timeline).await?; + + delete_remote_layers_and_index(timeline).await?; + + pausable_failpoint!("in_progress_delete"); + + cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?; + + remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; + + *guard.0 = Self::Finished; + + Ok(()) + } +} + +struct DeletionGuard(OwnedMutexGuard); + +impl Deref for DeletionGuard { + type Target = DeleteTimelineFlow; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for DeletionGuard { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index b8cc65f4b1..5a15e86458 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -2,13 +2,9 @@ use 
std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc}; use anyhow::Context; use tracing::{error, info, info_span, warn}; -use utils::{crashsafe, id::TimelineId, lsn::Lsn}; +use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn}; -use crate::{ - context::RequestContext, - import_datadir, - tenant::{ignore_absent_files, Tenant}, -}; +use crate::{context::RequestContext, import_datadir, tenant::Tenant}; use super::Timeline; @@ -141,7 +137,7 @@ impl Drop for UninitializedTimeline<'_> { pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { let timeline_path = &uninit_mark.timeline_path; - match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { + match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { Ok(()) => { info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") } @@ -185,7 +181,7 @@ impl TimelineUninitMark { let uninit_mark_parent = uninit_mark_file .parent() .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; - ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { + fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") })?; crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 57c09a4487..2642746c79 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -1123,7 +1123,7 @@ mod tests { } #[tokio::test] - async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { + async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> { let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); @@ -1189,8 +1189,8 @@ mod tests { } #[tokio::test] - async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?; + async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> { + let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1252,8 +1252,8 @@ mod tests { } #[tokio::test] - async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?; + async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> { + let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c index 9b6175a621..ed3b8a817f 100644 --- a/pgxn/neon/libpqwalproposer.c +++ b/pgxn/neon/libpqwalproposer.c @@ -292,7 +292,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount) /* * The docs for PQgetCopyData list the return values as: 0 if the copy is * still in progress, but no "complete row" is available -1 if the copy is - * done -2 if an error occured 
(> 0) if it was successful; that value is + * done -2 if an error occurred (> 0) if it was successful; that value is * the amount transferred. * * The protocol we use between walproposer and safekeeper means that we @@ -353,7 +353,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size) /* * The docs for PQputcopyData list the return values as: 1 if the data was * queued, 0 if it was not queued because of full buffers, or -1 if an - * error occured + * error occurred */ result = PQputCopyData(conn->pg_conn, buf, size); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 765966092d..807fd5c91b 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -788,7 +788,7 @@ ReconnectSafekeepers(void) /* * Performs the logic for advancing the state machine of the specified safekeeper, - * given that a certain set of events has occured. + * given that a certain set of events has occurred. */ static void AdvancePollState(Safekeeper *sk, uint32 events) diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index f016a229eb..615fbf9399 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -23,7 +23,7 @@ * message header */ /* - * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, + * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred, * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 */ #define WL_NO_EVENTS 0 @@ -317,7 +317,7 @@ typedef struct AppendResponse /* this is a criterion for walproposer --sync mode exit */ XLogRecPtr commitLsn; HotStandbyFeedback hs; - /* Feedback recieved from pageserver includes standby_status_update fields */ + /* Feedback received from pageserver includes standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. */ PageserverFeedback rf; diff --git a/poetry.lock b/poetry.lock index a8ea410734..74cec84a40 100644 --- a/poetry.lock +++ b/poetry.lock @@ -740,13 +740,13 @@ typing-extensions = ">=4.1.0" [[package]] name = "certifi" -version = "2022.12.7" +version = "2023.7.22" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"}, - {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"}, + {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, + {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, ] [[package]] diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 9322e4f9ff..ff73f2b625 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -53,6 +53,12 @@ pub enum BackendType<'a, T> { Postgres(Cow<'a, console::provider::mock::Api>, T), /// Authentication via a web browser. Link(Cow<'a, url::ApiUrl>), + /// Test backend. 
+ Test(&'a dyn TestBackend), +} + +pub trait TestBackend: Send + Sync + 'static { + fn wake_compute(&self) -> Result; } impl std::fmt::Display for BackendType<'_, ()> { @@ -62,6 +68,7 @@ impl std::fmt::Display for BackendType<'_, ()> { Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(), Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(), Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + Test(_) => fmt.debug_tuple("Test").finish(), } } } @@ -75,6 +82,7 @@ impl BackendType<'_, T> { Console(c, x) => Console(Cow::Borrowed(c), x), Postgres(c, x) => Postgres(Cow::Borrowed(c), x), Link(c) => Link(Cow::Borrowed(c)), + Test(x) => Test(*x), } } } @@ -89,6 +97,7 @@ impl<'a, T> BackendType<'a, T> { Console(c, x) => Console(c, f(x)), Postgres(c, x) => Postgres(c, f(x)), Link(c) => Link(c), + Test(x) => Test(x), } } } @@ -102,6 +111,7 @@ impl<'a, T, E> BackendType<'a, Result> { Console(c, x) => x.map(|x| Console(c, x)), Postgres(c, x) => x.map(|x| Postgres(c, x)), Link(c) => Ok(Link(c)), + Test(x) => Ok(Test(x)), } } } @@ -147,6 +157,7 @@ impl BackendType<'_, ClientCredentials<'_>> { Console(_, creds) => creds.project.clone(), Postgres(_, creds) => creds.project.clone(), Link(_) => Some("link".to_owned()), + Test(_) => Some("test".to_owned()), } } /// Authenticate the client via the requested backend, possibly using credentials. @@ -188,6 +199,9 @@ impl BackendType<'_, ClientCredentials<'_>> { .await? .map(CachedNodeInfo::new_uncached) } + Test(_) => { + unreachable!("this function should never be called in the test backend") + } }; info!("user successfully authenticated"); @@ -206,6 +220,7 @@ impl BackendType<'_, ClientCredentials<'_>> { Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await, Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await, Link(_) => Ok(None), + Test(x) => x.wake_compute().map(Some), } } } diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 6753e7ed7f..5ed0f747c2 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -1,8 +1,11 @@ +use std::ops::ControlFlow; + use super::AuthSuccess; use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra}, + proxy::handle_try_wake, sasl, scram, stream::PqStream, }; @@ -48,7 +51,16 @@ pub(super) async fn authenticate( } }; - let mut node = api.wake_compute(extra, creds).await?; + info!("compute node's state has likely changed; requesting a wake-up"); + let mut num_retries = 0; + let mut node = loop { + let wake_res = api.wake_compute(extra, creds).await; + match handle_try_wake(wake_res, num_retries)? 
{ + ControlFlow::Continue(_) => num_retries += 1, + ControlFlow::Break(n) => break n, + } + info!(num_retries, "retrying wake compute"); + }; if let Some(keys) = scram_keys { use tokio_postgres::config::AuthKeys; node.config.auth_keys(AuthKeys::ScramSha256(keys)); diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 3eaed1b82b..37190c76b8 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -14,6 +14,7 @@ pub mod errors { use crate::{ error::{io_error, UserFacingError}, http, + proxy::ShouldRetry, }; use thiserror::Error; @@ -72,6 +73,24 @@ pub mod errors { } } + impl ShouldRetry for ApiError { + fn could_retry(&self) -> bool { + match self { + // retry some transport errors + Self::Transport(io) => io.could_retry(), + // retry some temporary failures because the compute was in a bad state + // (bad request can be returned when the endpoint was in transition) + Self::Console { + status: http::StatusCode::BAD_REQUEST | http::StatusCode::LOCKED, + .. + } => true, + // retry server errors + Self::Console { status, .. } if status.is_server_error() => true, + _ => false, + } + } + } + impl From for ApiError { fn from(e: reqwest::Error) -> Self { io_error(e).into() diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index adf7252f72..a4eb13a10b 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -1,7 +1,9 @@ use std::sync::Arc; +use anyhow::bail; use futures::pin_mut; use futures::StreamExt; +use hashbrown::HashMap; use hyper::body::HttpBody; use hyper::http::HeaderName; use hyper::http::HeaderValue; @@ -11,6 +13,8 @@ use serde_json::Map; use serde_json::Value; use tokio_postgres::types::Kind; use tokio_postgres::types::Type; +use tokio_postgres::GenericClient; +use tokio_postgres::IsolationLevel; use tokio_postgres::Row; use url::Url; @@ -23,12 +27,21 @@ struct QueryData { params: Vec, } +#[derive(serde::Deserialize)] +#[serde(untagged)] +enum Payload { + Single(QueryData), + Batch(Vec), +} + pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); +static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level"); +static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only"); static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); @@ -162,7 +175,7 @@ pub async fn handle( request: Request, sni_hostname: Option, conn_pool: Arc, -) -> anyhow::Result { +) -> anyhow::Result<(Value, HashMap)> { // // Determine the destination and connection params // @@ -177,6 +190,23 @@ pub async fn handle( // Allow connection pooling only if explicitly requested let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + // isolation level and read only + + let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned(); + let txn_isolation_level = match txn_isolation_level_raw { + Some(ref x) => Some(match x.as_bytes() { + b"Serializable" => IsolationLevel::Serializable, + b"ReadUncommitted" => IsolationLevel::ReadUncommitted, + b"ReadCommitted" => IsolationLevel::ReadCommitted, + b"RepeatableRead" => IsolationLevel::RepeatableRead, + _ => bail!("invalid isolation level"), + }), + None => None, + }; + + let 
txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned(); + let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE); + let request_content_length = match request.body().size_hint().upper() { Some(v) => v, None => MAX_REQUEST_SIZE + 1, @@ -192,15 +222,70 @@ pub async fn handle( // Read the query and query params from the request body // let body = hyper::body::to_bytes(request.into_body()).await?; - let QueryData { query, params } = serde_json::from_slice(&body)?; - let query_params = json_to_pg_text(params)?; + let payload: Payload = serde_json::from_slice(&body)?; + + let mut client = conn_pool.get(&conn_info, !allow_pool).await?; // // Now execute the query and return the result // - let client = conn_pool.get(&conn_info, !allow_pool).await?; + let result = match payload { + Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode) + .await + .map(|x| (x, HashMap::default())), + Payload::Batch(queries) => { + let mut results = Vec::new(); + let mut builder = client.build_transaction(); + if let Some(isolation_level) = txn_isolation_level { + builder = builder.isolation_level(isolation_level); + } + if txn_read_only { + builder = builder.read_only(true); + } + let transaction = builder.start().await?; + for query in queries { + let result = query_to_json(&transaction, query, raw_output, array_mode).await; + match result { + Ok(r) => results.push(r), + Err(e) => { + transaction.rollback().await?; + return Err(e); + } + } + } + transaction.commit().await?; + let mut headers = HashMap::default(); + headers.insert( + TXN_READ_ONLY.clone(), + HeaderValue::try_from(txn_read_only.to_string())?, + ); + if let Some(txn_isolation_level_raw) = txn_isolation_level_raw { + headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw); + } + Ok((json!({ "results": results }), headers)) + } + }; - let row_stream = client.query_raw_txt(query, query_params).await?; + if allow_pool { + // return connection to the pool + tokio::task::spawn(async move { + let _ = conn_pool.put(&conn_info, client).await; + }); + } + + result +} + +async fn query_to_json( + client: &T, + data: QueryData, + raw_output: bool, + array_mode: bool, +) -> anyhow::Result { + let query_params = json_to_pg_text(data.params)?; + let row_stream = client + .query_raw_txt::(data.query, query_params) + .await?; // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too @@ -256,13 +341,6 @@ pub async fn handle( .map(|row| pg_text_row_to_json(row, raw_output, array_mode)) .collect::, _>>()?; - if allow_pool { - // return connection to the pool - tokio::task::spawn(async move { - let _ = conn_pool.put(&conn_info, client).await; - }); - } - // resulting JSON format is based on the format of node-postgres result Ok(json!({ "command": command_tag_name, diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index 5b7a87bc11..fec76c74f4 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -6,6 +6,7 @@ use crate::{ }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream, StreamExt}; +use hashbrown::HashMap; use hyper::{ server::{ accept, @@ -181,13 +182,15 @@ async fn ws_handler( // Check if the request is a websocket upgrade request. 
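Looking back at the batch support added to `sql_over_http` above: the untagged `Payload` enum means a request body is either a single query object or a JSON array of such objects, and the two new `neon-batch-*` headers tune the transaction that wraps a batch. A sketch of those shapes, where the header names, value spellings, and the `{"results": ...}` response wrapper come from this diff and the queries and parameters are placeholders:

```rust
// Sketch only: request bodies accepted by the single and batch paths.
use serde_json::json;

fn main() {
    // Payload::Single - a plain object.
    let single = json!({ "query": "SELECT 1", "params": [] });

    // Payload::Batch - an array of the same objects, run inside one transaction.
    // The transaction can be tuned with the new request headers:
    //   neon-batch-isolation-level: Serializable | ReadUncommitted | ReadCommitted | RepeatableRead
    //   neon-batch-read-only: true
    // The batch response is wrapped as {"results": [...]} and echoes those headers back.
    let batch = json!([
        { "query": "CREATE TABLE IF NOT EXISTS t (id int)", "params": [] },
        { "query": "SELECT count(*) FROM t", "params": [] }
    ]);

    println!("{single}\n{batch}");
}
```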
if hyper_tungstenite::is_upgrade_request(&request) { + info!(session_id = ?session_id, "performing websocket upgrade"); + let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) .map_err(|e| ApiError::BadRequest(e.into()))?; tokio::spawn(async move { if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await { - error!("error in websocket connection: {e:?}"); + error!(session_id = ?session_id, "error in websocket connection: {e:?}"); } }); @@ -203,7 +206,7 @@ async fn ws_handler( Ok(_) => StatusCode::OK, Err(_) => StatusCode::BAD_REQUEST, }; - let json = match result { + let (json, headers) = match result { Ok(r) => r, Err(e) => { let message = format!("{:?}", e); @@ -214,7 +217,10 @@ async fn ws_handler( }, None => Value::Null, }; - json!({ "message": message, "code": code }) + ( + json!({ "message": message, "code": code }), + HashMap::default(), + ) } }; json_response(status_code, json).map(|mut r| { @@ -222,6 +228,9 @@ async fn ws_handler( "Access-Control-Allow-Origin", hyper::http::HeaderValue::from_static("*"), ); + for (k, v) in headers { + r.headers_mut().insert(k, v); + } r }) } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index d317d382a7..5f59957b2d 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -6,18 +6,15 @@ use crate::{ cancellation::{self, CancelMap}, compute::{self, PostgresConnection}, config::{ProxyConfig, TlsConfig}, - console::{ - self, - errors::{ApiError, WakeComputeError}, - messages::MetricsAuxInfo, - }, + console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api}, stream::{PqStream, Stream}, }; use anyhow::{bail, Context}; use async_trait::async_trait; use futures::TryFutureExt; -use hyper::StatusCode; -use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; +use metrics::{ + exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec, +}; use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::{error::Error, io, ops::ControlFlow, sync::Arc}; @@ -31,25 +28,37 @@ use utils::measured_stream::MeasuredStream; /// Number of times we should retry the `/proxy_wake_compute` http request. /// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n -const NUM_RETRIES_CONNECT: u32 = 10; +pub const NUM_RETRIES_CONNECT: u32 = 10; const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; -static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter!( +static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( "proxy_accepted_connections_total", - "Number of TCP client connections accepted." + "Number of TCP client connections accepted.", + &["protocol"], ) .unwrap() }); -static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter!( +static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( "proxy_closed_connections_total", - "Number of TCP client connections closed." 
+ "Number of TCP client connections closed.", + &["protocol"], + ) + .unwrap() +}); + +static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_compute_connection_latency_seconds", + "Time it took for proxy to establish a connection to the compute endpoint", + // largest bucket = 2^16 * 0.5ms = 32s + exponential_buckets(0.0005, 2.0, 16).unwrap(), ) .unwrap() }); @@ -137,6 +146,13 @@ pub enum ClientMode { /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { + fn protocol_label(&self) -> &'static str { + match self { + ClientMode::Tcp => "tcp", + ClientMode::Websockets { .. } => "ws", + } + } + fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, @@ -175,10 +191,17 @@ pub async fn handle_client( stream: S, mode: ClientMode, ) -> anyhow::Result<()> { + info!( + protocol = mode.protocol_label(), + "handling interactive connection from client" + ); + // The `closed` counter will increase when this future is destroyed. - NUM_CONNECTIONS_ACCEPTED_COUNTER.inc(); + NUM_CONNECTIONS_ACCEPTED_COUNTER + .with_label_values(&[mode.protocol_label()]) + .inc(); scopeguard::defer! { - NUM_CONNECTIONS_CLOSED_COUNTER.inc(); + NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[mode.protocol_label()]).inc(); } let tls = config.tls_config.as_ref(); @@ -324,11 +347,6 @@ async fn connect_to_compute_once( .await } -enum ConnectionState { - Cached(console::CachedNodeInfo), - Invalid(compute::ConnCfg, E), -} - #[async_trait] pub trait ConnectMechanism { type Connection; @@ -380,88 +398,91 @@ where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { + let _timer = COMPUTE_CONNECTION_LATENCY.start_timer(); + mechanism.update_connect_config(&mut node_info.config); - let mut num_retries = 0; - let mut state = ConnectionState::::Cached(node_info); + // try once + let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { + Ok(res) => return Ok(res), + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + (invalidate_cache(node_info), e) + } + }; - loop { - match state { - ConnectionState::Invalid(config, err) => { - match try_wake(&config, extra, creds).await { - // we can't wake up the compute node - Ok(None) => return Err(err.into()), - // there was an error communicating with the control plane - Err(e) => return Err(e.into()), - // failed to wake up but we can continue to retry - Ok(Some(ControlFlow::Continue(()))) => { - state = ConnectionState::Invalid(config, err); - let wait_duration = retry_after(num_retries); - num_retries += 1; + let mut num_retries = 1; - info!(num_retries, "retrying wake compute"); - time::sleep(wait_duration).await; - continue; - } - // successfully woke up a compute node and can break the wakeup loop - Ok(Some(ControlFlow::Break(mut node_info))) => { - mechanism.update_connect_config(&mut node_info.config); - state = ConnectionState::Cached(node_info) - } - } + // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node + info!("compute node's state has likely changed; requesting a wake-up"); + let node_info = loop { + let wake_res = match creds { + auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await, + auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await, + // nothing to do? + auth::BackendType::Link(_) => return Err(err.into()), + // test backend + auth::BackendType::Test(x) => x.wake_compute(), + }; + + match handle_try_wake(wake_res, num_retries)? 
{ + // failed to wake up but we can continue to retry + ControlFlow::Continue(_) => {} + // successfully woke up a compute node and can break the wakeup loop + ControlFlow::Break(mut node_info) => { + node_info.config.reuse_password(&config); + mechanism.update_connect_config(&mut node_info.config); + break node_info; } - ConnectionState::Cached(node_info) => { - match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { - Ok(res) => return Ok(res), - Err(e) => { - error!(error = ?e, "could not connect to compute node"); - if !e.should_retry(num_retries) { - return Err(e.into()); - } + } - // after the first connect failure, - // we should invalidate the cache and wake up a new compute node - if num_retries == 0 { - state = ConnectionState::Invalid(invalidate_cache(node_info), e); - } else { - state = ConnectionState::Cached(node_info); - } + let wait_duration = retry_after(num_retries); + num_retries += 1; - let wait_duration = retry_after(num_retries); - num_retries += 1; + time::sleep(wait_duration).await; + info!(num_retries, "retrying wake compute"); + }; - info!(num_retries, "retrying wake compute"); - time::sleep(wait_duration).await; - } + // now that we have a new node, try connect to it repeatedly. + // this can error for a few reasons, for instance: + // * DNS connection settings haven't quite propagated yet + info!("wake_compute success. attempting to connect"); + loop { + match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { + Ok(res) => return Ok(res), + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + if !e.should_retry(num_retries) { + return Err(e.into()); } } } + + let wait_duration = retry_after(num_retries); + num_retries += 1; + + time::sleep(wait_duration).await; + info!(num_retries, "retrying connect_once"); } } /// Attempts to wake up the compute node. -/// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable -/// * Returns Ok(Some(false)) if the wakeup succeeded -/// * Returns Ok(None) or Err(e) if there was an error -async fn try_wake( - config: &compute::ConnCfg, - extra: &console::ConsoleReqExtra<'_>, - creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, -) -> Result>, WakeComputeError> { - info!("compute node's state has likely changed; requesting a wake-up"); - match creds.wake_compute(extra).await { - // retry wake if the compute was in an invalid state - Err(WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::BAD_REQUEST, - .. - })) => Ok(Some(ControlFlow::Continue(()))), - // Update `node_info` and try again. - Ok(Some(mut new)) => { - new.config.reuse_password(config); - Ok(Some(ControlFlow::Break(new))) - } - Err(e) => Err(e), - Ok(None) => Ok(None), +/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable +/// * Returns Ok(Break(node)) if the wakeup succeeded +/// * Returns Err(e) if there was an error +pub fn handle_try_wake( + result: Result, + num_retries: u32, +) -> Result, WakeComputeError> { + match result { + Err(err) => match &err { + WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { + Ok(ControlFlow::Continue(err)) + } + _ => Err(err), + }, + // Ready to try again. 
+ Ok(new) => Ok(ControlFlow::Break(new)), } } @@ -469,8 +490,6 @@ pub trait ShouldRetry { fn could_retry(&self) -> bool; fn should_retry(&self, num_retries: u32) -> bool { match self { - // retry all errors at least once - _ if num_retries == 0 => true, _ if num_retries >= NUM_RETRIES_CONNECT => false, err => err.could_retry(), } @@ -522,14 +541,9 @@ impl ShouldRetry for compute::ConnectionError { } } -pub fn retry_after(num_retries: u32) -> time::Duration { - match num_retries { - 0 => time::Duration::ZERO, - _ => { - // 3/2 = 1.5 which seems to be an ok growth factor heuristic - BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries) - } - } +fn retry_after(num_retries: u32) -> time::Duration { + // 1.5 seems to be an ok growth factor heuristic + BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32)) } /// Finish client connection initialization: confirm auth success, send params, etc. diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 565f86eecc..5653ec94dc 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,10 +1,10 @@ //! A group of high-level tests for connection establishing logic and auth. -use std::borrow::Cow; - +//! use super::*; +use crate::auth::backend::TestBackend; use crate::auth::ClientCredentials; use crate::console::{CachedNodeInfo, NodeInfo}; -use crate::{auth, sasl, scram}; +use crate::{auth, http, sasl, scram}; use async_trait::async_trait; use rstest::rstest; use tokio_postgres::config::SslMode; @@ -302,15 +302,18 @@ async fn scram_auth_mock() -> anyhow::Result<()> { #[test] fn connect_compute_total_wait() { let mut total_wait = tokio::time::Duration::ZERO; - for num_retries in 0..10 { + for num_retries in 1..10 { total_wait += retry_after(num_retries); } assert!(total_wait < tokio::time::Duration::from_secs(12)); assert!(total_wait > tokio::time::Duration::from_secs(10)); } -#[derive(Clone, Copy)] +#[derive(Clone, Copy, Debug)] enum ConnectAction { + Wake, + WakeFail, + WakeRetry, Connect, Retry, Fail, @@ -321,6 +324,17 @@ struct TestConnectMechanism { sequence: Vec, } +impl TestConnectMechanism { + fn verify(&self) { + let counter = self.counter.lock().unwrap(); + assert_eq!( + *counter, + self.sequence.len(), + "sequence does not proceed to the end" + ); + } +} + impl TestConnectMechanism { fn new(sequence: Vec) -> Self { Self { @@ -370,30 +384,63 @@ impl ConnectMechanism for TestConnectMechanism { ConnectAction::Connect => Ok(TestConnection), ConnectAction::Retry => Err(TestConnectError { retryable: true }), ConnectAction::Fail => Err(TestConnectError { retryable: false }), + x => panic!("expecting action {:?}, connect is called instead", x), } } fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {} } -fn helper_create_connect_info() -> ( - CachedNodeInfo, - console::ConsoleReqExtra<'static>, - auth::BackendType<'static, ClientCredentials<'static>>, -) { +impl TestBackend for TestConnectMechanism { + fn wake_compute(&self) -> Result { + let mut counter = self.counter.lock().unwrap(); + let action = self.sequence[*counter]; + *counter += 1; + match action { + ConnectAction::Wake => Ok(helper_create_cached_node_info()), + ConnectAction::WakeFail => { + let err = console::errors::ApiError::Console { + status: http::StatusCode::FORBIDDEN, + text: "TEST".into(), + }; + assert!(!err.could_retry()); + Err(console::errors::WakeComputeError::ApiError(err)) + } + ConnectAction::WakeRetry => { + let err = console::errors::ApiError::Console { + status: 
http::StatusCode::INTERNAL_SERVER_ERROR, + text: "TEST".into(), + }; + assert!(err.could_retry()); + Err(console::errors::WakeComputeError::ApiError(err)) + } + x => panic!("expecting action {:?}, wake_compute is called instead", x), + } + } +} + +fn helper_create_cached_node_info() -> CachedNodeInfo { let node = NodeInfo { config: compute::ConnCfg::new(), aux: Default::default(), allow_self_signed_compute: false, }; - let cache = CachedNodeInfo::new_uncached(node); + CachedNodeInfo::new_uncached(node) +} + +fn helper_create_connect_info( + mechanism: &TestConnectMechanism, +) -> ( + CachedNodeInfo, + console::ConsoleReqExtra<'static>, + auth::BackendType<'_, ClientCredentials<'static>>, +) { + let cache = helper_create_cached_node_info(); let extra = console::ConsoleReqExtra { session_id: uuid::Uuid::new_v4(), application_name: Some("TEST"), }; - let url = "https://TEST_URL".parse().unwrap(); - let api = console::provider::mock::Api::new(url); - let creds = auth::BackendType::Postgres(Cow::Owned(api), ClientCredentials::new_noop()); + let creds = auth::BackendType::Test(mechanism); (cache, extra, creds) } @@ -401,42 +448,46 @@ fn helper_create_connect_info() -> ( async fn connect_to_compute_success() { use ConnectAction::*; let mechanism = TestConnectMechanism::new(vec![Connect]); - let (cache, extra, creds) = helper_create_connect_info(); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); connect_to_compute(&mechanism, cache, &extra, &creds) .await .unwrap(); + mechanism.verify(); } #[tokio::test] async fn connect_to_compute_retry() { use ConnectAction::*; - let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Connect]); - let (cache, extra, creds) = helper_create_connect_info(); + let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); connect_to_compute(&mechanism, cache, &extra, &creds) .await .unwrap(); + mechanism.verify(); } /// Test that we don't retry if the error is not retryable. #[tokio::test] async fn connect_to_compute_non_retry_1() { use ConnectAction::*; - let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Fail]); - let (cache, extra, creds) = helper_create_connect_info(); + let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); connect_to_compute(&mechanism, cache, &extra, &creds) .await .unwrap_err(); + mechanism.verify(); } /// Even for non-retryable errors, we should retry at least once. #[tokio::test] async fn connect_to_compute_non_retry_2() { use ConnectAction::*; - let mechanism = TestConnectMechanism::new(vec![Fail, Retry, Connect]); - let (cache, extra, creds) = helper_create_connect_info(); + let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); connect_to_compute(&mechanism, cache, &extra, &creds) .await .unwrap(); + mechanism.verify(); } /// Retry for at most `NUM_RETRIES_CONNECT` times. 
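The reworked `retry_after` above, together with the adjusted `connect_compute_total_wait` test, fixes the backoff schedule to a pure 1.5x geometric series starting from the 100ms base. A quick sketch that reproduces the 10-12 second bound the test asserts, reusing the constants from this diff:

```rust
// Sketch: recompute the total wait asserted by `connect_compute_total_wait`.
use std::time::Duration;

const BASE_RETRY_WAIT_DURATION: Duration = Duration::from_millis(100);

fn retry_after(num_retries: u32) -> Duration {
    // 1.5 growth factor, same heuristic as the proxy code above.
    BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
}

fn main() {
    let total: Duration = (1..10).map(retry_after).sum();
    // 100ms * (1.5^1 + 1.5^2 + ... + 1.5^9) ~= 11.23s,
    // which falls inside the 10s..12s window the test checks.
    println!("{total:?}");
}
```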
@@ -445,11 +496,36 @@ async fn connect_to_compute_non_retry_3() { assert_eq!(NUM_RETRIES_CONNECT, 10); use ConnectAction::*; let mechanism = TestConnectMechanism::new(vec![ - Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, + Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, /* the 11th time */ Retry, ]); - let (cache, extra, creds) = helper_create_connect_info(); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); connect_to_compute(&mechanism, cache, &extra, &creds) .await .unwrap_err(); + mechanism.verify(); +} + +/// Should retry wake compute. +#[tokio::test] +async fn wake_retry() { + use ConnectAction::*; + let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); + connect_to_compute(&mechanism, cache, &extra, &creds) + .await + .unwrap(); + mechanism.verify(); +} + +/// Wake failed with a non-retryable error. +#[tokio::test] +async fn wake_non_retry() { + use ConnectAction::*; + let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); + connect_to_compute(&mechanism, cache, &extra, &creds) + .await + .unwrap_err(); + mechanism.verify(); } diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index abede2e44d..90eebd392c 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -234,7 +234,10 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { listen_pg_addr_tenant_only ); let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| { - error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); + error!( + "failed to bind to address {}: {}", + listen_pg_addr_tenant_only, e + ); e })?; Some(listener) diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 5fe9db9628..f11075a736 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -11,6 +11,7 @@ use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED}; +use crate::timeline::TimelineError; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; use postgres_backend::QueryError; @@ -45,6 +46,7 @@ enum SafekeeperPostgresCommand { StartWalPush, StartReplication { start_lsn: Lsn }, IdentifySystem, + TimelineStatus, JSONCtrl { cmd: AppendLogicalMessage }, } @@ -64,6 +66,8 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { Ok(SafekeeperPostgresCommand::StartReplication { start_lsn }) } else if cmd.starts_with("IDENTIFY_SYSTEM") { Ok(SafekeeperPostgresCommand::IdentifySystem) + } else if cmd.starts_with("TIMELINE_STATUS") { + Ok(SafekeeperPostgresCommand::TimelineStatus) } else if cmd.starts_with("JSON_CTRL") { let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?; Ok(SafekeeperPostgresCommand::JSONCtrl { @@ -78,6 +82,7 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { match cmd { SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH", SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION", + SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS", SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", SafekeeperPostgresCommand::JSONCtrl { .. 
} => "JSON_CTRL", } @@ -219,6 +224,7 @@ impl postgres_backend::Handler .await } SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, + SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { handle_json_ctrl(self, pgb, cmd).await } @@ -263,6 +269,38 @@ impl SafekeeperPostgresHandler { check_permission(claims, tenant_id) } + async fn handle_timeline_status( + &mut self, + pgb: &mut PostgresBackend, + ) -> Result<(), QueryError> { + // Get timeline, handling "not found" error + let tli = match GlobalTimelines::get(self.ttid) { + Ok(tli) => Ok(Some(tli)), + Err(TimelineError::NotFound(_)) => Ok(None), + Err(e) => Err(QueryError::Other(e.into())), + }?; + + // Write row description + pgb.write_message_noflush(&BeMessage::RowDescription(&[ + RowDescriptor::text_col(b"flush_lsn"), + RowDescriptor::text_col(b"commit_lsn"), + ]))?; + + // Write row if timeline exists + if let Some(tli) = tli { + let (inmem, _state) = tli.get_state().await; + let flush_lsn = tli.get_flush_lsn().await; + let commit_lsn = inmem.commit_lsn; + pgb.write_message_noflush(&BeMessage::DataRow(&[ + Some(flush_lsn.to_string().as_bytes()), + Some(commit_lsn.to_string().as_bytes()), + ]))?; + } + + pgb.write_message_noflush(&BeMessage::CommandComplete(b"TIMELINE_STATUS"))?; + Ok(()) + } + /// /// Handle IDENTIFY_SYSTEM replication command /// diff --git a/scripts/combine_control_files.py b/scripts/combine_control_files.py new file mode 100644 index 0000000000..0350e4721d --- /dev/null +++ b/scripts/combine_control_files.py @@ -0,0 +1,80 @@ +#! /usr/bin/env python3 +# Script to generate ext_index.json metadata file +# that stores content of the control files and location of extension archives +# for all extensions in extensions subdir. +import argparse +import json +import subprocess +from pathlib import Path + +""" +# ext_index.json example: +{ + "public_extensions": [ + "anon" + ], + "library_index": { + "anon": "anon", + "kq_imcx": "kq_imcx" + // would be more complicated for something like postgis where multiple library names all map to postgis + }, + "extension_data": { + "kq_imcx": { + "control_data": { + "kq_imcx.control": "# This file is generated content from add_postgresql_extension.\n# No point in modifying it, it will be overwritten anyway.\n\n# Default version, always set\ndefault_version = '0.1'\n\n# Module pathname generated from target shared library name. Use\n# MODULE_PATHNAME in script file.\nmodule_pathname = '$libdir/kq_imcx.so'\n\n# Comment for extension. Set using COMMENT option. Can be set in\n# script file as well.\ncomment = 'ketteQ In-Memory Calendar Extension (IMCX)'\n\n# Encoding for script file. Set using ENCODING option.\n#encoding = ''\n\n# Required extensions. 
Set using REQUIRES option (multi-valued).\n#requires = ''\ntrusted = true\n" + }, + "archive_path": "5648391853/v15/extensions/kq_imcx.tar.zst" + }, + "anon": { + "control_data": { + "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" + }, + "archive_path": "5648391853/v15/extensions/anon.tar.zst" + } + } +} +""" + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="generate ext_index.json") + parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version") + parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image") + parser.add_argument("--public_extensions", type=str, help="list of public extensions") + args = parser.parse_args() + pg_version = args.pg_version + BUILD_TAG = args.BUILD_TAG + public_ext_list = args.public_extensions.split(",") + + ext_index = {} + library_index = {} + EXT_PATH = Path("extensions") + for extension in EXT_PATH.iterdir(): + if extension.is_dir(): + control_data = {} + for control_file in extension.glob("*.control"): + if control_file.suffix != ".control": + continue + with open(control_file, "r") as f: + control_data[control_file.name] = f.read() + ext_index[extension.name] = { + "control_data": control_data, + "archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst", + } + elif extension.suffix == ".zst": + file_list = ( + str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8") + .strip() + .split("\n") + ) + for file in file_list: + if file.endswith(".so") and file.startswith("lib/"): + lib_name = file[4:-3] + library_index[lib_name] = extension.name.replace(".tar.zst", "") + + all_data = { + "public_extensions": public_ext_list, + "library_index": library_index, + "extension_data": ext_index, + } + with open("ext_index.json", "w") as f: + json.dump(all_data, f) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index eafc061ab9..fb78d69d67 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -542,7 +542,7 @@ class S3Storage: access_key: str secret_key: str endpoint: Optional[str] = None - prefix_in_bucket: Optional[str] = None + prefix_in_bucket: Optional[str] = "" def access_env_vars(self) -> Dict[str, str]: return { @@ -1504,6 +1504,7 @@ class NeonCli(AbstractNeonCli): safekeepers: Optional[List[int]] = None, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, + branch_name: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1517,8 +1518,11 @@ class NeonCli(AbstractNeonCli): args.append(f"--lsn={lsn}") args.extend(["--pg-port", str(pg_port)]) args.extend(["--http-port", str(http_port)]) + if safekeepers is not None: args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) + if branch_name is not None: + args.extend(["--branch-name", branch_name]) if endpoint_id is not None: args.append(endpoint_id) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index ad89ebad00..f8a4423ffa 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -194,14 +194,18 @@ def wait_for_upload_queue_empty( def wait_timeline_detail_404( - pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: 
TimelineId + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + wait_longer: bool = False, ): last_exc = None - for _ in range(2): + iterations = 10 if wait_longer else 2 + for _ in range(iterations): time.sleep(0.250) try: data = pageserver_http.timeline_detail(tenant_id, timeline_id) - log.error(f"detail {data}") + log.info(f"detail {data}") except PageserverApiException as e: log.debug(e) if e.status_code == 404: @@ -216,7 +220,8 @@ def timeline_delete_wait_completed( pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, + wait_longer: bool = False, # Use when running with RemoteStorageKind.REAL_S3 **delete_args, ): pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) - wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id) + wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer) diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index 875be3b7b0..fade78504a 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -61,6 +61,7 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc durations = { "wait_for_spec_ms": f"{i}_wait_for_spec", "sync_safekeepers_ms": f"{i}_sync_safekeepers", + "sync_sk_check_ms": f"{i}_sync_sk_check", "basebackup_ms": f"{i}_basebackup", "start_postgres_ms": f"{i}_start_postgres", "config_ms": f"{i}_config", diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 4df5ae18d6..b6b46f9401 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -123,7 +123,7 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests): @pytest.mark.parametrize("content_type", [None, "application/json"]) def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]): """ - For backwards-compatiblity: if we send an empty body, + For backwards-compatibility: if we send an empty body, the request should be accepted and the config should be the default config. """ env = positive_env diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index a3d02c3f5a..754d23a01a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -4,7 +4,7 @@ import shutil import subprocess import tempfile from pathlib import Path -from typing import Any, Optional +from typing import Any, List, Optional import pytest import toml # TODO: replace with tomllib for Python >= 3.11 @@ -14,7 +14,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, PortDistributor, - parse_project_git_version_output, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( @@ -63,7 +62,6 @@ def test_create_snapshot( neon_env_builder.pg_version = pg_version neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_local_fs_remote_storage() - neon_env_builder.preserve_database_files = True env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") @@ -259,36 +257,15 @@ def prepare_snapshot( shutil.rmtree(repo_dir / "pgdatadirs") os.mkdir(repo_dir / "endpoints") - # Remove wal-redo temp directory if it exists. Newer pageserver versions don't create - # them anymore, but old versions did. 
- for tenant in (repo_dir / "tenants").glob("*"): - wal_redo_dir = tenant / "wal-redo-datadir.___temp" - if wal_redo_dir.exists() and wal_redo_dir.is_dir(): - shutil.rmtree(wal_redo_dir) - # Update paths and ports in config files pageserver_toml = repo_dir / "pageserver.toml" pageserver_config = toml.load(pageserver_toml) pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage") - pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port( - pageserver_config["listen_http_addr"] - ) - pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port( - pageserver_config["listen_pg_addr"] - ) - # since storage_broker these are overriden by neon_local during pageserver - # start; remove both to prevent unknown options during etcd -> - # storage_broker migration. TODO: remove once broker is released - pageserver_config.pop("broker_endpoint", None) - pageserver_config.pop("broker_endpoints", None) - etcd_broker_endpoints = [f"http://localhost:{port_distributor.get_port()}/"] - if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0": - pageserver_config["broker_endpoints"] = etcd_broker_endpoints # old etcd version + for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"): + pageserver_config[param] = port_distributor.replace_with_new_port(pageserver_config[param]) - # Older pageserver versions had just one `auth_type` setting. Now there - # are separate settings for pg and http ports. We don't use authentication - # in compatibility tests so just remove authentication related settings. - pageserver_config.pop("auth_type", None) + # We don't use authentication in compatibility tests + # so just remove authentication related settings. pageserver_config.pop("pg_auth_type", None) pageserver_config.pop("http_auth_type", None) @@ -300,31 +277,16 @@ def prepare_snapshot( snapshot_config_toml = repo_dir / "config" snapshot_config = toml.load(snapshot_config_toml) - - # Provide up/downgrade etcd <-> storage_broker to make forward/backward - # compatibility test happy. TODO: leave only the new part once broker is released. 
- if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0": - # old etcd version - snapshot_config["etcd_broker"] = { - "etcd_binary_path": shutil.which("etcd"), - "broker_endpoints": etcd_broker_endpoints, - } - snapshot_config.pop("broker", None) - else: - # new storage_broker version - broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}" - snapshot_config["broker"] = {"listen_addr": broker_listen_addr} - snapshot_config.pop("etcd_broker", None) - - snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port( - snapshot_config["pageserver"]["listen_http_addr"] - ) - snapshot_config["pageserver"]["listen_pg_addr"] = port_distributor.replace_with_new_port( - snapshot_config["pageserver"]["listen_pg_addr"] + for param in ("listen_http_addr", "listen_pg_addr"): + snapshot_config["pageserver"][param] = port_distributor.replace_with_new_port( + snapshot_config["pageserver"][param] + ) + snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port( + snapshot_config["broker"]["listen_addr"] ) for sk in snapshot_config["safekeepers"]: - sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"]) - sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"]) + for param in ("http_port", "pg_port", "pg_tenant_only_port"): + sk[param] = port_distributor.replace_with_new_port(sk[param]) if pg_distrib_dir: snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir) @@ -350,12 +312,6 @@ def prepare_snapshot( ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" -# get git SHA of neon binary -def get_neon_version(neon_binpath: Path): - out = subprocess.check_output([neon_binpath / "neon_local", "--version"]).decode("utf-8") - return parse_project_git_version_output(out) - - def check_neon_works( repo_dir: Path, neon_target_binpath: Path, @@ -381,7 +337,6 @@ def check_neon_works( config.pg_version = pg_version config.initial_tenant = snapshot_config["default_tenant_id"] config.pg_distrib_dir = pg_distrib_dir - config.preserve_database_files = True # Use the "target" binaries to launch the storage nodes config_target = config @@ -438,6 +393,14 @@ def check_neon_works( test_output_dir / "dump-from-wal.filediff", ) + # TODO: Run pg_amcheck unconditionally after the next release + try: + pg_bin.run(["psql", connstr, "--command", "CREATE EXTENSION IF NOT EXISTS amcheck"]) + except subprocess.CalledProcessError: + log.info("Extension amcheck is not available, skipping pg_amcheck") + else: + pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"]) + # Check that we can interract with the data pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr]) @@ -445,10 +408,15 @@ def check_neon_works( assert not initial_dump_differs, "initial dump differs" -def dump_differs(first: Path, second: Path, output: Path) -> bool: +def dump_differs( + first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None +) -> bool: """ Runs diff(1) command on two SQL dumps and write the output to the given output file. - Returns True if the dumps differ, False otherwise. + The function supports allowed diffs, if the diff is in the allowed_diffs list, it's not considered as a difference. 
+ See the example of it in https://github.com/neondatabase/neon/pull/4425/files#diff-15c5bfdd1d5cc1411b9221091511a60dd13a9edf672bdfbb57dd2ef8bb7815d6 + + Returns True if the dumps differ and produced diff is not allowed, False otherwise (in most cases we want it to return False). """ with output.open("w") as stdout: @@ -466,51 +434,30 @@ def dump_differs(first: Path, second: Path, output: Path) -> bool: differs = res.returncode != 0 - # TODO: Remove after https://github.com/neondatabase/neon/pull/4425 is merged, and a couple of releases are made - if differs: - with tempfile.NamedTemporaryFile(mode="w") as tmp: - tmp.write(PR4425_ALLOWED_DIFF) - tmp.flush() + allowed_diffs = allowed_diffs or [] + if differs and len(allowed_diffs) > 0: + for allowed_diff in allowed_diffs: + with tempfile.NamedTemporaryFile(mode="w") as tmp: + tmp.write(allowed_diff) + tmp.flush() - allowed = subprocess.run( - [ - "diff", - "--unified", # Make diff output more readable - r"--ignore-matching-lines=^---", # Ignore diff headers - r"--ignore-matching-lines=^\+\+\+", # Ignore diff headers - "--ignore-matching-lines=^@@", # Ignore diff blocks location - "--ignore-matching-lines=^ *$", # Ignore lines with only spaces - "--ignore-matching-lines=^ --.*", # Ignore the " --" lines for compatibility with PG14 - "--ignore-blank-lines", - str(output), - str(tmp.name), - ], - ) + allowed = subprocess.run( + [ + "diff", + "--unified", # Make diff output more readable + r"--ignore-matching-lines=^---", # Ignore diff headers + r"--ignore-matching-lines=^\+\+\+", # Ignore diff headers + "--ignore-matching-lines=^@@", # Ignore diff blocks location + "--ignore-matching-lines=^ *$", # Ignore lines with only spaces + "--ignore-matching-lines=^ --.*", # Ignore SQL comments in diff + "--ignore-blank-lines", + str(output), + str(tmp.name), + ], + ) - differs = allowed.returncode != 0 + differs = allowed.returncode != 0 + if not differs: + break return differs - - -PR4425_ALLOWED_DIFF = """ ---- /tmp/test_output/test_backward_compatibility[release-pg15]/compatibility_snapshot/dump.sql 2023-06-08 18:12:45.000000000 +0000 -+++ /tmp/test_output/test_backward_compatibility[release-pg15]/dump.sql 2023-06-13 07:25:35.211733653 +0000 -@@ -13,12 +13,20 @@ - - CREATE ROLE cloud_admin; - ALTER ROLE cloud_admin WITH SUPERUSER INHERIT CREATEROLE CREATEDB LOGIN REPLICATION BYPASSRLS; -+CREATE ROLE neon_superuser; -+ALTER ROLE neon_superuser WITH NOSUPERUSER INHERIT CREATEROLE CREATEDB NOLOGIN NOREPLICATION NOBYPASSRLS; - - -- - -- User Configurations - -- - - -+-- -+-- Role memberships -+-- -+ -+GRANT pg_read_all_data TO neon_superuser GRANTED BY cloud_admin; -+GRANT pg_write_all_data TO neon_superuser GRANTED BY cloud_admin; -""" diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py index 6e2a0622f1..f58abb4575 100644 --- a/test_runner/regress/test_gc_cutoff.py +++ b/test_runner/regress/test_gc_cutoff.py @@ -14,10 +14,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env = neon_env_builder.init_start() - # These warnings are expected, when the pageserver is restarted abruptly - env.pageserver.allowed_errors.append(".*found future image layer.*") - env.pageserver.allowed_errors.append(".*found future delta layer.*") - pageserver_http = env.pageserver.http_client() # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test diff --git a/test_runner/regress/test_neon_local_cli.py 
b/test_runner/regress/test_neon_local_cli.py index 3314e7fbf6..6dd47de8cf 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -16,11 +16,13 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por endpoint_id="ep-basic-main", pg_port=pg_port, http_port=http_port ) - env.neon_cli.create_branch(new_branch_name="migration_check") + branch_name = "migration-check" + + env.neon_cli.create_branch(new_branch_name=branch_name) pg_port = port_distributor.get_port() http_port = port_distributor.get_port() env.neon_cli.endpoint_start( - endpoint_id="ep-migration_check", pg_port=pg_port, http_port=http_port + f"ep-{branch_name}", pg_port, http_port, branch_name=branch_name ) finally: env.neon_cli.stop() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index b92981a371..ca43df350d 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -690,10 +690,6 @@ def test_ondemand_download_failure_to_replace( pageserver_http = env.pageserver.http_client() - lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"]) - - wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) - # remove layers so that they will be redownloaded pageserver_http.tenant_detach(tenant_id) pageserver_http.tenant_attach(tenant_id) @@ -704,8 +700,10 @@ def test_ondemand_download_failure_to_replace( # requesting details with non-incremental size should trigger a download of the only layer # this will need to be adjusted if an index for logical sizes is ever implemented with pytest.raises(PageserverApiException): - # error message is not useful - pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2) + # PageserverApiException is expected because of the failpoint (timeline_detail building does something) + # ReadTimeout can happen on our busy CI, but it should not, because there is no more busylooping + # but should it be added back, we would wait for 15s here. + pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=15) actual_message = ".* ERROR .*layermap-replace-notfound" assert env.pageserver.log_contains(actual_message) is not None diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 6da5503fb1..b8ddbe3ec1 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -72,10 +72,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - # These warnings are expected, when the pageserver is restarted abruptly - env.pageserver.allowed_errors.append(".*found future image layer.*") - env.pageserver.allowed_errors.append(".*found future delta layer.*") - # Use a tiny checkpoint distance, to create a lot of layers quickly. # That allows us to stress the compaction and layer flushing logic more. 
tenant, _ = env.neon_cli.create_tenant( diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 1f6dcd39e9..35334ec7b2 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -1,6 +1,6 @@ import json import subprocess -from typing import Any, List, Optional +from typing import Any, List, Optional, Tuple import psycopg2 import pytest @@ -260,3 +260,73 @@ def test_sql_over_http_output_options(static_proxy: NeonProxy): rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", True, True)["rows"] assert rows == [["1", "a", "{1,2,3}"]] + + +def test_sql_over_http_batch(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) -> Any: + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))), + headers={ + "Content-Type": "application/sql", + "Neon-Connection-String": connstr, + "Neon-Batch-Isolation-Level": "Serializable", + "Neon-Batch-Read-Only": "true" if read_only else "false", + }, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200 + return response.json()["results"], response.headers + + result, headers = qq( + [ + ("select 42 as answer", None), + ("select $1 as answer", [42]), + ("select $1 * 1 as answer", [42]), + ("select $1::int[] as answer", [[1, 2, 3]]), + ("select $1::json->'a' as answer", [{"a": {"b": 42}}]), + ("select * from pg_class limit 1", None), + ("create table t(id serial primary key, val int)", None), + ("insert into t(val) values (10), (20), (30) returning id", None), + ("select * from t", None), + ("drop table t", None), + ] + ) + + assert headers["Neon-Batch-Isolation-Level"] == "Serializable" + assert headers["Neon-Batch-Read-Only"] == "false" + + assert result[0]["rows"] == [{"answer": 42}] + assert result[1]["rows"] == [{"answer": "42"}] + assert result[2]["rows"] == [{"answer": 42}] + assert result[3]["rows"] == [{"answer": [1, 2, 3]}] + assert result[4]["rows"] == [{"answer": {"b": 42}}] + assert len(result[5]["rows"]) == 1 + res = result[6] + assert res["command"] == "CREATE" + assert res["rowCount"] is None + res = result[7] + assert res["command"] == "INSERT" + assert res["rowCount"] == 3 + assert res["rows"] == [{"id": 1}, {"id": 2}, {"id": 3}] + res = result[8] + assert res["command"] == "SELECT" + assert res["rowCount"] == 3 + res = result[9] + assert res["command"] == "DROP" + assert res["rowCount"] is None + assert len(result) == 10 + + result, headers = qq( + [ + ("select 42 as answer", None), + ], + True, + ) + assert headers["Neon-Batch-Isolation-Level"] == "Serializable" + assert headers["Neon-Batch-Read-Only"] == "true" + + assert result[0]["rows"] == [{"answer": 42}] diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 552825cf08..9d7a4a8fd6 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -15,10 +15,6 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.is_testing_enabled_or_skip() - # These warnings are expected, when the pageserver is restarted abruptly - env.pageserver.allowed_errors.append(".*found future delta layer.*") - 
env.pageserver.allowed_errors.append(".*found future image layer.*") - # Create a branch for us env.neon_cli.create_branch("test_pageserver_recovery", "main") diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index c7083d92be..4d8c87e09e 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -38,6 +38,12 @@ def test_threshold_based_eviction( env = neon_env_builder.init_start() env.pageserver.allowed_errors.append(metrics_refused_log_line) + # these can happen whenever we run consumption metrics collection + env.pageserver.allowed_errors.append(r".*failed to calculate logical size at \S+: cancelled") + env.pageserver.allowed_errors.append( + r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes" + ) + tenant_id, timeline_id = env.initial_tenant, env.initial_timeline assert isinstance(timeline_id, TimelineId) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index a4c5bf626a..9226ca21d2 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,3 +1,4 @@ +import enum import os import queue import shutil @@ -11,9 +12,12 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + PgBin, RemoteStorageKind, S3Storage, available_remote_storages, + last_flush_lsn_upload, + wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( @@ -117,59 +121,183 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) +class Check(enum.Enum): + RETRY_WITHOUT_RESTART = enum.auto() + RETRY_WITH_RESTART = enum.auto() + + +DELETE_FAILPOINTS = [ + "timeline-delete-before-index-deleted-at", + "timeline-delete-before-schedule", + "timeline-delete-before-rm", + "timeline-delete-during-rm", + "timeline-delete-after-rm", + "timeline-delete-before-index-delete", + "timeline-delete-after-index-delete", + "timeline-delete-after-rm-metadata", + "timeline-delete-after-rm-dir", +] + + +def combinations(): + result = [] + + remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3] + if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"): + remotes.append(RemoteStorageKind.REAL_S3) + + for remote_storage_kind in remotes: + for delete_failpoint in DELETE_FAILPOINTS: + if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in ( + "timeline-delete-before-index-delete", + "timeline-delete-after-index-delete", + ): + # the above failpoints are not relevant for config without remote storage + continue + + result.append((remote_storage_kind, delete_failpoint)) + return result + + # cover the two cases: remote storage configured vs not configured -@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) -def test_delete_timeline_post_rm_failure( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations()) +@pytest.mark.parametrize("check", list(Check)) +def test_delete_timeline_exercise_crash_safety_failpoints( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, + failpoint: str, + check: Check, + pg_bin: PgBin, ): """ - If there is a failure after removing the timeline directory, the delete operation - should be retryable. 
+ If there is a failure during deletion at one of the associated failpoints (or a crash/restart happens at that point), the delete operation + should be retryable and resume successfully. + + We iterate over the failpoint list, moving to the next failpoint on each iteration. + + 1. Set settings to generate many layers + 2. Create branch. + 3. Insert something + 4. Run the test. + 5. Iterate over failpoints + 6. Execute delete for each failpoint + 7. Ensure failpoint is hit + 8. Retry or restart without the failpoint and check the result. """ if remote_storage_kind is not None: neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_delete_timeline_post_rm_failure" + remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints" ) - env = neon_env_builder.init_start() - assert env.initial_timeline - - env.pageserver.allowed_errors.append(".*Error: failpoint: timeline-delete-after-rm") - env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline") + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024 ** 2}", + "image_creation_threshold": "100", + } + ) ps_http = env.pageserver.http_client() - failpoint_name = "timeline-delete-after-rm" - ps_http.configure_failpoints((failpoint_name, "return")) + timeline_id = env.neon_cli.create_timeline("delete") + with env.endpoints.create_start("delete") as endpoint: + # generate enough layers + pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()]) + if remote_storage_kind is RemoteStorageKind.NOOP: + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id) + else: + last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id) - ps_http.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_until_timeline_state( - pageserver_http=ps_http, - tenant_id=env.initial_tenant, - timeline_id=env.initial_timeline, - expected_state="Broken", - iterations=2, # effectively try immediately and retry once in one second - ) - - # FIXME: #4719 - # timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm" - - at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*" - env.pageserver.allowed_errors.append(at_failpoint_log_message) + env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}") + # This appears when we have stopped the flush loop during deletion and the pageserver is then stopped env.pageserver.allowed_errors.append( - f".*DELETE.*{env.initial_timeline}.*InternalServerError.*{failpoint_name}" + ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited" ) - - # retry without failpoint, it should succeed - ps_http.configure_failpoints((failpoint_name, "off")) - - # this should succeed - # this also checks that delete can be retried even when timeline is in Broken state - timeline_delete_wait_completed(ps_http, env.initial_tenant, env.initial_timeline) + # This happens when we fail before scheduling the background operation. + # The timeline is left in Stopping state and the retry tries to stop it again.
env.pageserver.allowed_errors.append( + ".*Ignoring new state, equal to the existing one: Stopping" + ) + # This happens when we retry delete requests for broken timelines + env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline") + # This happens when timeline remains are cleaned up during loading + env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*") + # In one of the branches we poll for the tenant to become active. Polls can generate this log message: + env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*") + + ps_http.configure_failpoints((failpoint, "return")) + + # These failpoints are hit before the background task is spawned, + # so they result in an API request failure. + if failpoint in ( + "timeline-delete-before-index-deleted-at", + "timeline-delete-before-schedule", + ): + with pytest.raises(PageserverApiException, match=failpoint): + ps_http.timeline_delete(env.initial_tenant, timeline_id) + + else: + ps_http.timeline_delete(env.initial_tenant, timeline_id) + timeline_info = wait_until_timeline_state( + pageserver_http=ps_http, + tenant_id=env.initial_tenant, + timeline_id=timeline_id, + expected_state="Broken", + iterations=2, # effectively try immediately and retry once in one second + ) + + reason = timeline_info["state"]["Broken"]["reason"] + log.info(f"timeline broken: {reason}") + + # the failpoint may not be the only error in the stack + assert reason.endswith(f"failpoint: {failpoint}"), reason + + wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3 + if check is Check.RETRY_WITH_RESTART: + env.pageserver.stop() + env.pageserver.start() + if failpoint == "timeline-delete-before-index-deleted-at": + # We crashed before persisting this to remote storage, so we need to retry the delete request + + # Wait till the tenant is loaded. Shouldn't take longer than 2 seconds (we shouldn't block tenant loading) + wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2) + + timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id) + else: + # Pageserver should've resumed deletion after restart.
+ wait_timeline_detail_404( + ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer + ) + elif check is Check.RETRY_WITHOUT_RESTART: + # this should succeed + # this also checks that delete can be retried even when timeline is in Broken state + ps_http.configure_failpoints((failpoint, "off")) + + timeline_delete_wait_completed( + ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer + ) + + # Check remote is empty + if remote_storage_kind is RemoteStorageKind.MOCK_S3: + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(env.initial_tenant), + "timelines", + str(timeline_id), + ) + ), + ) + + timeline_dir = env.timeline_dir(env.initial_tenant, timeline_id) + # Check local is empty + assert not timeline_dir.exists() + # Check no delete mark present + assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists() @@ -327,7 +455,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ) ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id) - wait_until_timeline_state( + timeline_info = wait_until_timeline_state( pageserver_http=ps_http, tenant_id=env.initial_tenant, timeline_id=leaf_timeline_id, @@ -335,8 +463,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild iterations=2, # effectively try immediately and retry once in one second ) - # FIXME: #4719 - # timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm" + assert timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-before-rm" assert leaf_timeline_path.exists(), "the failpoint didn't work" @@ -588,6 +715,7 @@ def test_timeline_delete_works_for_remote_smoke( assert tenant_id == env.initial_tenant assert main_timeline_id == env.initial_timeline + assert env.initial_timeline is not None timeline_ids = [env.initial_timeline] for i in range(2): branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 12c5dc8281..ebedb34d01 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 12c5dc8281d20b5bd636e1097eea80a7bc609591 +Subproject commit ebedb34d01c8ac9c31e8ea4628b9854103a1dc8f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index e3fbfc4d14..1220c8a63f 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit e3fbfc4d143b2d3c3c1813ce747f8af35aa9405e +Subproject commit 1220c8a63f00101829f9222a5821fc084b4384c7 diff --git a/vendor/revisions.json b/vendor/revisions.json index 18da5900a8..f5d7428cd9 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,4 +1,4 @@ { - "postgres-v15": "e3fbfc4d143b2d3c3c1813ce747f8af35aa9405e", - "postgres-v14": "12c5dc8281d20b5bd636e1097eea80a7bc609591" + "postgres-v15": "1220c8a63f00101829f9222a5821fc084b4384c7", + "postgres-v14": "ebedb34d01c8ac9c31e8ea4628b9854103a1dc8f" }
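
Note on the new TIMELINE_STATUS safekeeper command (handled above alongside IDENTIFY_SYSTEM and JSON_CTRL): it is served over the safekeeper's libpq port and returns at most one row with flush_lsn and commit_lsn, and no data row if the timeline is unknown. A minimal client sketch follows; the host, port, role, and the ztenantid/ztimelineid startup options used to select the timeline are assumptions, not taken from this diff, so treat it as an illustration only.

import psycopg2

# Sketch: query TIMELINE_STATUS from a safekeeper over its PostgreSQL port.
# host/port/user and the ztenantid/ztimelineid options are placeholders and assumptions;
# adjust them to however your safekeeper deployment identifies the timeline.
conn = psycopg2.connect(
    host="127.0.0.1",
    port=5454,
    user="cloud_admin",
    dbname="postgres",
    options="-c ztenantid=<tenant-id> -c ztimelineid=<timeline-id>",
)
with conn.cursor() as cur:
    cur.execute("TIMELINE_STATUS")
    row = cur.fetchone()  # None when the safekeeper has no such timeline
    if row is None:
        print("timeline not found on this safekeeper")
    else:
        flush_lsn, commit_lsn = row
        print(f"flush_lsn={flush_lsn} commit_lsn={commit_lsn}")
conn.close()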
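
To make the ext_index.json layout concrete, here is a small consumer sketch based only on the example embedded in scripts/combine_control_files.py: it resolves a shared library name to the extension archive that provides it. It assumes an ext_index.json produced by the script is in the current directory and reuses the example names (kq_imcx, anon) from the script's comment.

import json

# Sketch: look up an extension archive by the shared library it provides,
# using an ext_index.json generated by scripts/combine_control_files.py.
# Assumes ./ext_index.json exists; names follow the example in that script.
with open("ext_index.json") as f:
    index = json.load(f)

lib_name = "kq_imcx"  # e.g. the library named in LOAD or shared_preload_libraries
ext_name = index["library_index"][lib_name]
ext = index["extension_data"][ext_name]

print(ext["archive_path"])                         # e.g. "5648391853/v15/extensions/kq_imcx.tar.zst"
print(ext_name in index["public_extensions"])      # False for custom extensions, True for e.g. "anon"
print(ext["control_data"][f"{ext_name}.control"])  # raw contents of the .control file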
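
The batch behaviour exercised by test_sql_over_http_batch above boils down to: POST a JSON array of {"query", "params"} objects to the proxy's /sql endpoint, with Neon-Connection-String, Neon-Batch-Isolation-Level and Neon-Batch-Read-Only headers controlling the connection and transaction; the response carries a "results" array plus the echoed batch headers. A standalone client sketch follows; the host, ports, credentials, and certificate path are placeholders, not real endpoints.

import json
import requests

# Sketch of a batched SQL-over-HTTP request, mirroring test_sql_over_http_batch.
# Host, ports, credentials, and the certificate path are placeholders.
connstr = "postgresql://http:http@proxy.example.invalid:4432/postgres"
queries = [
    {"query": "select 42 as answer", "params": []},
    {"query": "select $1 * 1 as answer", "params": [42]},
]
response = requests.post(
    "https://proxy.example.invalid:4443/sql",
    data=json.dumps(queries),
    headers={
        "Content-Type": "application/sql",
        "Neon-Connection-String": connstr,
        "Neon-Batch-Isolation-Level": "Serializable",
        "Neon-Batch-Read-Only": "false",
    },
    verify="proxy.crt",  # the test uses the proxy's self-signed certificate
)
response.raise_for_status()
for result in response.json()["results"]:
    print(result.get("command"), result.get("rowCount"), result.get("rows"))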