diff --git a/.dockerignore b/.dockerignore index 1258532db8..c7a2f78e32 100644 --- a/.dockerignore +++ b/.dockerignore @@ -8,6 +8,7 @@ !scripts/combine_control_files.py !scripts/ninstall.sh !vm-cgconfig.conf +!docker-compose/run-tests.sh # Directories !.cargo/ @@ -20,7 +21,7 @@ !patches/ !pgxn/ !proxy/ -!s3_scrubber/ +!storage_scrubber/ !safekeeper/ !storage_broker/ !storage_controller/ diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1fc0fbb0b6..79a0a77638 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -299,21 +299,21 @@ jobs: uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}-${{ hashFiles('Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' @@ -858,6 +858,26 @@ jobs: cache-to: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max tags: | neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }} + + - name: Build neon extensions test image + if: matrix.version == 'v16' + uses: docker/build-push-action@v5 + with: + context: . + build-args: | + GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }} + PG_VERSION=${{ matrix.version }} + BUILD_TAG=${{ needs.tag.outputs.build-tag }} + TAG=${{ needs.build-build-tools-image.outputs.image-tag }} + provenance: false + push: true + pull: true + file: Dockerfile.compute-node + target: neon-pg-ext-test + cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }} + cache-to: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }},mode=max + tags: | + neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }} - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once @@ -902,6 +922,13 @@ jobs: neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + - name: Create multi-arch neon-test-extensions image + if: matrix.version == 'v16' + run: | + docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} \ + neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-x64 \ + neondatabase/neon-test-extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-arm64 + - name: Create multi-arch compute-tools image if: matrix.version == 'v16' run: | @@ -1020,7 +1047,7 @@ jobs: exit 1 fi - - name: Verify docker-compose example + - name: Verify docker-compose example and test extensions timeout-minutes: 20 run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh diff --git a/Cargo.lock b/Cargo.lock index dbbf330cf9..1c8a8b0c0f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5109,54 +5109,6 @@ version = "1.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" -[[package]] -name = "s3_scrubber" -version = "0.1.0" -dependencies = [ - "anyhow", - "async-stream", - "aws-config", - "aws-sdk-s3", - "aws-smithy-async", - "bincode", - "bytes", - "camino", - "chrono", - "clap", - "crc32c", - "either", - "futures", - "futures-util", - "hex", - "histogram", - "humantime", - "itertools", - "once_cell", - "pageserver", - "pageserver_api", - "postgres_ffi", - "rand 0.8.5", - "remote_storage", - "reqwest 0.12.4", - "rustls 0.22.4", - "rustls-native-certs 0.7.0", - "serde", - "serde_json", - "serde_with", - "thiserror", - "tokio", - "tokio-postgres", - "tokio-postgres-rustls", - "tokio-rustls 0.25.0", - "tokio-stream", - "tokio-util", - "tracing", - "tracing-appender", - "tracing-subscriber", - "utils", - "workspace_hack", -] - [[package]] name = "safekeeper" version = "0.1.0" @@ -5813,6 +5765,54 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "storage_scrubber" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-stream", + "aws-config", + "aws-sdk-s3", + "aws-smithy-async", + "bincode", + "bytes", + "camino", + "chrono", + "clap", + "crc32c", + "either", + "futures", + "futures-util", + "hex", + "histogram", + "humantime", + "itertools", + "once_cell", + "pageserver", + "pageserver_api", + "postgres_ffi", + "rand 0.8.5", + "remote_storage", + "reqwest 0.12.4", + "rustls 0.22.4", + "rustls-native-certs 0.7.0", + "serde", + "serde_json", + "serde_with", + "thiserror", + "tokio", + "tokio-postgres", + "tokio-postgres-rustls", + "tokio-rustls 0.25.0", + "tokio-stream", + "tokio-util", + "tracing", + "tracing-appender", + "tracing-subscriber", + "utils", + "workspace_hack", +] + [[package]] name = "storcon_cli" version = "0.1.0" @@ -5820,6 +5820,7 @@ dependencies = [ "anyhow", "clap", "comfy-table", + "futures", "humantime", "hyper 0.14.26", "pageserver_api", diff --git a/Cargo.toml b/Cargo.toml index 58715db32b..dc89c2341b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ members = [ "safekeeper", "storage_broker", "storage_controller", - "s3_scrubber", + "storage_scrubber", "workspace_hack", "trace", "libs/compute_api", diff --git a/Dockerfile b/Dockerfile index b4900d4a94..5f82df3e18 100644 --- a/Dockerfile +++ b/Dockerfile @@ -69,6 +69,8 @@ RUN set -e \ && apt install -y \ libreadline-dev \ libseccomp-dev \ + libicu67 \ + openssl \ ca-certificates \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \ && useradd -d /data neon \ diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 91194eda1a..460b8c996d 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -112,35 +112,6 @@ RUN for package in Capture::Tiny DateTime Devel::Cover Digest::MD5 File::Spec JS && make install \ && rm -rf ../lcov.tar.gz -# Compile and install the static OpenSSL library -ENV OPENSSL_VERSION=3.2.2 -ENV OPENSSL_PREFIX=/usr/local/openssl -RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz && \ - echo "197149c18d9e9f292c43f0400acaba12e5f52cacfe050f3d199277ea738ec2e7 /tmp/openssl-${OPENSSL_VERSION}.tar.gz" | sha256sum --check && \ - cd /tmp && \ - tar xzvf /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - rm /tmp/openssl-${OPENSSL_VERSION}.tar.gz && \ - cd /tmp/openssl-${OPENSSL_VERSION} && \ - ./config --prefix=${OPENSSL_PREFIX} -static --static no-shared -fPIC && \ - make ${MAKE_ARGS} && \ - make install && \ - cd /tmp && \ - rm -rf /tmp/openssl-${OPENSSL_VERSION} - -# Set the ICU version -ENV ICU_VERSION=72.1 -ENV ICU_PREFIX=/usr/local/icu - -# Download and build static ICU -RUN wget https://github.com/unicode-org/icu/releases/download/release-${ICU_VERSION//./-}/icu4c-${ICU_VERSION//./_}-src.tgz && \ - tar -xzf icu4c-${ICU_VERSION//./_}-src.tgz && \ - cd icu/source && \ - ./configure --prefix=${ICU_PREFIX} --enable-static --enable-shared=no CXXFLAGS="-fPIC" CFLAGS="-fPIC" && \ - make && \ - make install && \ - cd ../.. && \ - rm -rf icu icu4c-${ICU_VERSION//./_}-src.tgz - # Switch to nonroot user USER nonroot:nonroot WORKDIR /home/nonroot @@ -199,6 +170,3 @@ RUN whoami \ && rustup --version --verbose \ && rustc --version --verbose \ && clang --version - -# Set following flag to check in Makefile if its running in Docker -RUN touch /home/nonroot/.docker_build diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 90b8868b43..a86fdd0bc3 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -928,6 +928,69 @@ RUN rm -r /usr/local/pgsql/include # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a + +######################################################################################### +# +# Layer neon-pg-ext-test +# +######################################################################################### + +FROM neon-pg-ext-build AS neon-pg-ext-test +ARG PG_VERSION +RUN mkdir /ext-src + +#COPY --from=postgis-build /postgis.tar.gz /ext-src/ +#COPY --from=postgis-build /sfcgal/* /usr +COPY --from=plv8-build /plv8.tar.gz /ext-src/ +COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ +COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ +COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ +COPY --from=vector-pg-build /pgvector.patch /ext-src/ +COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src +#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src +#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src +#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src +COPY --from=hypopg-pg-build /hypopg.tar.gz /ext-src +COPY --from=pg-hashids-pg-build /pg_hashids.tar.gz /ext-src +#COPY --from=rum-pg-build /rum.tar.gz /ext-src +#COPY --from=pgtap-pg-build /pgtap.tar.gz /ext-src +COPY --from=ip4r-pg-build /ip4r.tar.gz /ext-src +COPY --from=prefix-pg-build /prefix.tar.gz /ext-src +COPY --from=hll-pg-build /hll.tar.gz /ext-src +COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src +#COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src +COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src +COPY patches/pg_hintplan.patch /ext-src +#COPY --from=kq-imcx-pg-build /kq_imcx.tar.gz /ext-src +COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src +COPY patches/pg_cron.patch /ext-src +#COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src +COPY --from=rdkit-pg-build /rdkit.tar.gz /ext-src +COPY --from=pg-uuidv7-pg-build /pg_uuidv7.tar.gz /ext-src +COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src +COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src +#COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src +#COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src +COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src +COPY patches/pg_anon.patch /ext-src +COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src +COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src +RUN cd /ext-src/ && for f in *.tar.gz; \ + do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ + rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ + || exit 1; rm -f $f; done +RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +# cmake is required for the h3 test +RUN apt-get update && apt-get install -y cmake +RUN patch -p1 < /ext-src/pg_hintplan.patch +COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh +RUN patch -p1 , + // Optional: migration concurrency (default is 8) + #[arg(long)] + concurrency: Option, + // Optional: maximum number of shards to migrate + #[arg(long)] + max_shards: Option, + // Optional: when set to true, nothing is migrated, but the plan is printed to stdout + #[arg(long)] + dry_run: Option, + }, } #[derive(Parser)] @@ -737,6 +754,197 @@ async fn main() -> anyhow::Result<()> { }) .await?; } + Command::Drain { + nodes, + concurrency, + max_shards, + dry_run, + } => { + // Load the list of nodes, split them up into the drained and filled sets, + // and validate that draining is possible. + let node_descs = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/node".to_string(), + None, + ) + .await?; + + let mut node_to_drain_descs = Vec::new(); + let mut node_to_fill_descs = Vec::new(); + + for desc in node_descs { + let to_drain = nodes.iter().any(|id| *id == desc.id); + if to_drain { + node_to_drain_descs.push(desc); + } else { + node_to_fill_descs.push(desc); + } + } + + if nodes.len() != node_to_drain_descs.len() { + anyhow::bail!("Drain requested for node which doesn't exist.") + } + + let can_fill = node_to_fill_descs + .iter() + .filter(|desc| { + matches!(desc.availability, NodeAvailabilityWrapper::Active) + && matches!( + desc.scheduling, + NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling + ) + }) + .any(|_| true); + + if !can_fill { + anyhow::bail!("There are no nodes to drain to") + } + + // Set the node scheduling policy to draining for the nodes which + // we plan to drain. + for node_desc in node_to_drain_descs.iter() { + let req = NodeConfigureRequest { + node_id: node_desc.id, + availability: None, + scheduling: Some(NodeSchedulingPolicy::Draining), + }; + + storcon_client + .dispatch::<_, ()>( + Method::PUT, + format!("control/v1/node/{}/config", node_desc.id), + Some(req), + ) + .await?; + } + + // Perform the drain: move each tenant shard scheduled on a node to + // be drained to a node which is being filled. A simple round robin + // strategy is used to pick the new node. + let tenants = storcon_client + .dispatch::<(), Vec>( + Method::GET, + "control/v1/tenant".to_string(), + None, + ) + .await?; + + let mut selected_node_idx = 0; + + struct DrainMove { + tenant_shard_id: TenantShardId, + from: NodeId, + to: NodeId, + } + + let mut moves: Vec = Vec::new(); + + let shards = tenants + .into_iter() + .flat_map(|tenant| tenant.shards.into_iter()); + for shard in shards { + if let Some(max_shards) = max_shards { + if moves.len() >= max_shards { + println!( + "Stop planning shard moves since the requested maximum was reached" + ); + break; + } + } + + let should_migrate = { + if let Some(attached_to) = shard.node_attached { + node_to_drain_descs + .iter() + .map(|desc| desc.id) + .any(|id| id == attached_to) + } else { + false + } + }; + + if !should_migrate { + continue; + } + + moves.push(DrainMove { + tenant_shard_id: shard.tenant_shard_id, + from: shard + .node_attached + .expect("We only migrate attached tenant shards"), + to: node_to_fill_descs[selected_node_idx].id, + }); + selected_node_idx = (selected_node_idx + 1) % node_to_fill_descs.len(); + } + + let total_moves = moves.len(); + + if dry_run == Some(true) { + println!("Dryrun requested. Planned {total_moves} moves:"); + for mv in &moves { + println!("{}: {} -> {}", mv.tenant_shard_id, mv.from, mv.to) + } + + return Ok(()); + } + + const DEFAULT_MIGRATE_CONCURRENCY: usize = 8; + let mut stream = futures::stream::iter(moves) + .map(|mv| { + let client = Client::new(cli.api.clone(), cli.jwt.clone()); + async move { + client + .dispatch::( + Method::PUT, + format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id), + Some(TenantShardMigrateRequest { + tenant_shard_id: mv.tenant_shard_id, + node_id: mv.to, + }), + ) + .await + .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e)) + } + }) + .buffered(concurrency.unwrap_or(DEFAULT_MIGRATE_CONCURRENCY)); + + let mut success = 0; + let mut failure = 0; + + while let Some(res) = stream.next().await { + match res { + Ok(_) => { + success += 1; + } + Err((tenant_shard_id, from, to, error)) => { + failure += 1; + println!( + "Failed to migrate {} from node {} to node {}: {}", + tenant_shard_id, from, to, error + ); + } + } + + if (success + failure) % 20 == 0 { + println!( + "Processed {}/{} shards: {} succeeded, {} failed", + success + failure, + total_moves, + success, + failure + ); + } + } + + println!( + "Processed {}/{} shards: {} succeeded, {} failed", + success + failure, + total_moves, + success, + failure + ); + } } Ok(()) diff --git a/docker-compose/compute_wrapper/Dockerfile b/docker-compose/compute_wrapper/Dockerfile index 974dcd7f03..8378f37b48 100644 --- a/docker-compose/compute_wrapper/Dockerfile +++ b/docker-compose/compute_wrapper/Dockerfile @@ -8,6 +8,11 @@ USER root RUN apt-get update && \ apt-get install -y curl \ jq \ + python3-pip \ netcat +#Faker is required for the pg_anon test +RUN pip3 install Faker +#This is required for the pg_hintplan test +RUN mkdir -p /ext-src/pg_hint_plan-src && chown postgres /ext-src/pg_hint_plan-src -USER postgres +USER postgres \ No newline at end of file diff --git a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json index ccf0a91b90..8e582e74e1 100644 --- a/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json +++ b/docker-compose/compute_wrapper/var/db/postgres/specs/spec.json @@ -95,7 +95,7 @@ }, { "name": "shared_preload_libraries", - "value": "neon", + "value": "neon,pg_cron,timescaledb,pg_stat_statements", "vartype": "string" }, { @@ -127,6 +127,16 @@ "name": "max_replication_flush_lag", "value": "10GB", "vartype": "string" + }, + { + "name": "cron.database", + "value": "postgres", + "vartype": "string" + }, + { + "name": "session_preload_libraries", + "value": "anon", + "vartype": "string" } ] }, diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 9777d1fdd2..3f097f2700 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3' - services: minio: restart: always @@ -161,12 +159,12 @@ services: context: ./compute_wrapper/ args: - REPOSITORY=${REPOSITORY:-neondatabase} - - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14} + - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-16} - TAG=${TAG:-latest} - http_proxy=$http_proxy - https_proxy=$https_proxy environment: - - PG_VERSION=${PG_VERSION:-14} + - PG_VERSION=${PG_VERSION:-16} #- RUST_BACKTRACE=1 # Mount the test files directly, for faster editing cycle. volumes: @@ -194,3 +192,13 @@ services: done" depends_on: - compute + + neon-test-extensions: + image: ${REPOSITORY:-neondatabase}/neon-test-extensions-v${PG_TEST_VERSION:-16}:${TAG:-latest} + entrypoint: + - "/bin/bash" + - "-c" + command: + - sleep 1800 + depends_on: + - compute diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh index 062fc6fc92..71e73c2d0a 100755 --- a/docker-compose/docker_compose_test.sh +++ b/docker-compose/docker_compose_test.sh @@ -7,15 +7,21 @@ # Implicitly accepts `REPOSITORY` and `TAG` env vars that are passed into the compose file # Their defaults point at DockerHub `neondatabase/neon:latest` image.`, # to verify custom image builds (e.g pre-published ones). - +# +# A test script for postgres extensions +# Currently supports only v16 +# set -eux -o pipefail -SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -COMPOSE_FILE=$SCRIPT_DIR/docker-compose.yml - +COMPOSE_FILE='docker-compose.yml' +cd $(dirname $0) +docker compose -f $COMPOSE_FILE COMPUTE_CONTAINER_NAME=docker-compose-compute-1 -SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;" -PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres" +TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1 +PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres" +: ${http_proxy:=} +: ${https_proxy:=} +export http_proxy https_proxy cleanup() { echo "show container information" @@ -25,34 +31,71 @@ cleanup() { docker compose -f $COMPOSE_FILE down } -echo "clean up containers if exists" -cleanup - for pg_version in 14 15 16; do - echo "start containers (pg_version=$pg_version)." - PG_VERSION=$pg_version docker compose -f $COMPOSE_FILE up --build -d + echo "clean up containers if exists" + cleanup + PG_TEST_VERSION=$(($pg_version < 16 ? 16 : $pg_version)) + PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose -f $COMPOSE_FILE up --build -d echo "wait until the compute is ready. timeout after 60s. " cnt=0 - while sleep 1; do + while sleep 3; do # check timeout - cnt=`expr $cnt + 1` + cnt=`expr $cnt + 3` if [ $cnt -gt 60 ]; then echo "timeout before the compute is ready." cleanup exit 1 fi - - # check if the compute is ready - set +o pipefail - result=`docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l` - set -o pipefail - if [ $result -eq 1 ]; then + if docker compose -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then echo "OK. The compute is ready to connect." echo "execute simple queries." docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION" - cleanup break fi done + + if [ $pg_version -ge 16 ] + then + echo Enabling trust connection + docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' " + echo Adding postgres role + docker exec $COMPUTE_CONTAINER_NAME psql $PSQL_OPTION -c "CREATE ROLE postgres SUPERUSER LOGIN" + # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail + # It cannot be moved to Dockerfile now because the database directory is created after the start of the container + echo Adding dummy config + docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf + # This block is required for the pg_anon extension test. + # The test assumes that it is running on the same host with the postgres engine. + # In our case it's not true, that's why we are copying files to the compute node + TMPDIR=$(mktemp -d) + docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data + echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data + rm -rf $TMPDIR + TMPDIR=$(mktemp -d) + # The following block does the same for the pg_hintplan test + docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/ + rm -rf $TMPDIR + # We are running tests now + if docker exec -e SKIP=rum-src,timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,pg_graphql-src,kq_imcx-src,wal2json_2_5-src \ + $TEST_CONTAINER_NAME /run-tests.sh | tee testout.txt + then + cleanup + else + FAILED=$(tail -1 testout.txt) + for d in $FAILED + do + mkdir $d + docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.diffs $d || true + docker cp $TEST_CONTAINER_NAME:/ext-src/$d/regression.out $d || true + cat $d/regression.out $d/regression.diffs || true + done + rm -rf $FAILED + cleanup + exit 1 + fi + fi + cleanup done diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh new file mode 100644 index 0000000000..c05fc159aa --- /dev/null +++ b/docker-compose/run-tests.sh @@ -0,0 +1,15 @@ +#!/bin/bash +set -x + +cd /ext-src +FAILED= +LIST=$((echo ${SKIP} | sed 's/,/\n/g'; ls -d *-src) | sort | uniq -u) +for d in ${LIST} +do + [ -d ${d} ] || continue + psql -c "select 1" >/dev/null || break + make -C ${d} installcheck || FAILED="${d} ${FAILED}" +done +[ -z "${FAILED}" ] && exit 0 +echo ${FAILED} +exit 1 \ No newline at end of file diff --git a/docs/docker.md b/docs/docker.md index cbf68be3a7..ccd2afc27a 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -34,12 +34,12 @@ You can see a [docker compose](https://docs.docker.com/compose/) example to crea 1. create containers You can specify version of neon cluster using following environment values. -- PG_VERSION: postgres version for compute (default is 14) +- PG_VERSION: postgres version for compute (default is 16) - TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) ``` $ cd docker-compose/ $ docker-compose down # remove the containers if exists -$ PG_VERSION=15 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version +$ PG_VERSION=16 TAG=2937 docker-compose up --build -d # You can specify the postgres and image version Creating network "dockercompose_default" with the default driver Creating docker-compose_storage_broker_1 ... done (...omit...) diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index a33cae6769..20018846f8 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -1,11 +1,6 @@ -use std::collections::HashMap; - use anyhow::Context; use camino::Utf8PathBuf; -use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; -use pageserver::tenant::storage_layer::LayerName; -use pageserver::tenant::{metadata::TimelineMetadata, IndexPart}; -use utils::lsn::Lsn; +use pageserver::tenant::IndexPart; #[derive(clap::Subcommand)] pub(crate) enum IndexPartCmd { @@ -17,20 +12,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { IndexPartCmd::Dump { path } => { let bytes = tokio::fs::read(path).await.context("read file")?; let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; - #[derive(serde::Serialize)] - struct Output<'a> { - layer_metadata: &'a HashMap, - disk_consistent_lsn: Lsn, - timeline_metadata: &'a TimelineMetadata, - } - - let output = Output { - layer_metadata: &des.layer_metadata, - disk_consistent_lsn: des.metadata.disk_consistent_lsn(), - timeline_metadata: &des.metadata, - }; - - let output = serde_json::to_string_pretty(&output).context("serialize output")?; + let output = serde_json::to_string_pretty(&des).context("serialize output")?; println!("{output}"); Ok(()) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 60cd5c9695..f9ed6d3071 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3395,6 +3395,12 @@ impl Tenant { let tenant_shard_id = raw_timeline.owning_tenant.tenant_shard_id; let unfinished_timeline = raw_timeline.raw_timeline()?; + // Flush the new layer files to disk, before we make the timeline as available to + // the outside world. + // + // Flush loop needs to be spawned in order to be able to flush. + unfinished_timeline.maybe_spawn_flush_loop(); + import_datadir::import_timeline_from_postgres_datadir( unfinished_timeline, &pgdata_path, @@ -3406,12 +3412,6 @@ impl Tenant { format!("Failed to import pgdatadir for timeline {tenant_shard_id}/{timeline_id}") })?; - // Flush the new layer files to disk, before we make the timeline as available to - // the outside world. - // - // Flush loop needs to be spawned in order to be able to flush. - unfinished_timeline.maybe_spawn_flush_loop(); - fail::fail_point!("before-checkpoint-new-timeline", |_| { anyhow::bail!("failpoint before-checkpoint-new-timeline"); }); @@ -4044,10 +4044,12 @@ mod tests { use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; + use itertools::Itertools; use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use rand::{thread_rng, Rng}; + use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; use utils::bin_ser::BeSer; @@ -6584,8 +6586,8 @@ mod tests { } #[tokio::test] - async fn test_metadata_tombstone_image_creation() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?; + async fn test_metadata_tombstone_image_creation() { + let harness = TenantHarness::create("test_metadata_tombstone_image_creation").unwrap(); let (tenant, ctx) = harness.load().await; let key0 = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6613,7 +6615,8 @@ mod tests { vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], Lsn(0x30), ) - .await?; + .await + .unwrap(); let cancel = CancellationToken::new(); @@ -6628,23 +6631,24 @@ mod tests { }, &ctx, ) - .await?; + .await + .unwrap(); // Image layers are created at last_record_lsn let images = tline .inspect_image_layers(Lsn(0x30), &ctx) - .await? + .await + .unwrap() .into_iter() .filter(|(k, _)| k.is_metadata_key()) .collect::>(); assert_eq!(images.len(), 2); // the image layer should only contain two existing keys, tombstones should be removed. - - Ok(()) } #[tokio::test] - async fn test_metadata_tombstone_empty_image_creation() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_metadata_tombstone_image_creation")?; + async fn test_metadata_tombstone_empty_image_creation() { + let harness = + TenantHarness::create("test_metadata_tombstone_empty_image_creation").unwrap(); let (tenant, ctx) = harness.load().await; let key1 = Key::from_hex("620000000033333333444444445500000001").unwrap(); @@ -6666,7 +6670,8 @@ mod tests { vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], Lsn(0x30), ) - .await?; + .await + .unwrap(); let cancel = CancellationToken::new(); @@ -6681,16 +6686,174 @@ mod tests { }, &ctx, ) - .await?; + .await + .unwrap(); // Image layers are created at last_record_lsn let images = tline .inspect_image_layers(Lsn(0x30), &ctx) - .await? + .await + .unwrap() .into_iter() .filter(|(k, _)| k.is_metadata_key()) .collect::>(); assert_eq!(images.len(), 0); // the image layer should not contain tombstones, or it is not created + } + + #[tokio::test] + async fn test_simple_bottom_most_compaction() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_bottom_most_compaction")?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside `collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. + // + // | D1 | | D3 | + // -| |-- gc horizon ----------------- + // | | | D2 | + // --------- img layer ------------------ + // + // What we should expact from this compaction is: + // | Part of D1 | | D3 | + // --------- img layer with D1+D2 at GC horizon------------------ + + // img layer at 0x10 + let img_layer = (0..10) + .map(|id| (get_key(id), test_img(&format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + // TODO: we should test a real delta record here, which requires us to add a variant of NeonWalRecord for testing purpose. + ( + get_key(1), + Lsn(0x20), + Value::Image(test_img("value 1@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::Image(test_img("value 2@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::Image(test_img("value 3@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::Image(test_img("value 5@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::Image(test_img("value 6@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x40), + Value::Image(test_img("value 8@0x40")), + ), + ( + get_key(9), + Lsn(0x40), + Value::Image(test_img("value 9@0x40")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![delta1, delta2, delta3], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + guard.cutoffs.pitr = Lsn(0x30); + guard.cutoffs.horizon = Lsn(0x30); + } + + let cancel = CancellationToken::new(); + tline.compact_with_gc(&cancel, &ctx).await.unwrap(); + + // Check if the image layer at the GC horizon contains exactly what we want + let image_at_gc_horizon = tline + .inspect_image_layers(Lsn(0x30), &ctx) + .await + .unwrap() + .into_iter() + .filter(|(k, _)| k.is_metadata_key()) + .collect::>(); + + assert_eq!(image_at_gc_horizon.len(), 10); + let expected_lsn = [0x10, 0x20, 0x30, 0x10, 0x10, 0x20, 0x20, 0x10, 0x10, 0x10]; + for idx in 0..10 { + assert_eq!( + image_at_gc_horizon[idx], + ( + get_key(idx as u32), + test_img(&format!("value {idx}@{:#x}", expected_lsn[idx])) + ) + ); + } + + // Check if old layers are removed / new layers have the expected LSN + let mut all_layers = tline.inspect_historic_layers().await.unwrap(); + all_layers.sort_by(|k1, k2| { + ( + k1.is_delta, + k1.key_range.start, + k1.key_range.end, + k1.lsn_range.start, + k1.lsn_range.end, + ) + .cmp(&( + k2.is_delta, + k2.key_range.start, + k2.key_range.end, + k2.lsn_range.start, + k2.lsn_range.end, + )) + }); + assert_eq!( + all_layers, + vec![ + // Image layer at GC horizon + PersistentLayerKey { + key_range: Key::MIN..get_key(10), + lsn_range: Lsn(0x30)..Lsn(0x31), + is_delta: false + }, + // The delta layer that is cut in the middle + PersistentLayerKey { + key_range: Key::MIN..get_key(9), + lsn_range: Lsn(0x30)..Lsn(0x41), + is_delta: true + }, + // The delta layer we created and should not be picked for the compaction + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x40)..Lsn(0x41), + is_delta: true + } + ] + ); Ok(()) } diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index c00672895a..6ba1bdef9b 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -1,15 +1,23 @@ -//! Every image of a certain timeline from [`crate::tenant::Tenant`] -//! has a metadata that needs to be stored persistently. +//! Describes the legacy now hopefully no longer modified per-timeline metadata stored in +//! `index_part.json` managed by [`remote_timeline_client`]. For many tenants and their timelines, +//! this struct and it's original serialization format is still needed because they were written a +//! long time ago. //! -//! Later, the file gets used in [`remote_timeline_client`] as a part of -//! external storage import and export operations. +//! Instead of changing and adding versioning to this, just change [`IndexPart`] with soft json +//! versioning. //! -//! The module contains all structs and related helper methods related to timeline metadata. +//! To clean up this module we need to migrate all index_part.json files to a later version. +//! While doing this, we need to be mindful about s3 based recovery as well, so it might take +//! however long we keep the old versions to be able to delete the old code. After that, we can +//! remove everything else than [`TimelineMetadataBodyV2`], rename it as `TimelineMetadata` and +//! move it to `index.rs`. Before doing all of this, we need to keep the structures for backwards +//! compatibility. //! //! [`remote_timeline_client`]: super::remote_timeline_client +//! [`IndexPart`]: super::remote_timeline_client::index::IndexPart use anyhow::ensure; -use serde::{de::Error, Deserialize, Serialize, Serializer}; +use serde::{Deserialize, Serialize}; use utils::bin_ser::SerializeError; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; @@ -17,17 +25,37 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn}; const METADATA_FORMAT_VERSION: u16 = 4; /// Previous supported format versions. +/// +/// In practice, none of these should remain, all are [`METADATA_FORMAT_VERSION`], but confirming +/// that requires a scrubber run which is yet to be done. const METADATA_OLD_FORMAT_VERSION: u16 = 3; -/// We assume that a write of up to METADATA_MAX_SIZE bytes is atomic. +/// When the file existed on disk we assumed that a write of up to METADATA_MAX_SIZE bytes is atomic. /// /// This is the same assumption that PostgreSQL makes with the control file, +/// /// see PG_CONTROL_MAX_SAFE_SIZE const METADATA_MAX_SIZE: usize = 512; -/// Metadata stored on disk for each timeline +/// Legacy metadata stored as a component of `index_part.json` per timeline. /// -/// The fields correspond to the values we hold in memory, in Timeline. +/// Do not make new changes to this type or the module. In production, we have two different kinds +/// of serializations of this type: bincode and json. Bincode version reflects what used to be +/// stored on disk in earlier versions and does internal crc32 checksumming. +/// +/// This type should not implement `serde::Serialize` or `serde::Deserialize` because there would +/// be a confusion whether you want the old version ([`TimelineMetadata::from_bytes`]) or the modern +/// as-exists in `index_part.json` ([`self::modern_serde`]). +/// +/// ```compile_fail +/// #[derive(serde::Serialize)] +/// struct DoNotDoThis(pageserver::tenant::metadata::TimelineMetadata); +/// ``` +/// +/// ```compile_fail +/// #[derive(serde::Deserialize)] +/// struct NeitherDoThis(pageserver::tenant::metadata::TimelineMetadata); +/// ``` #[derive(Debug, Clone, PartialEq, Eq)] pub struct TimelineMetadata { hdr: TimelineMetadataHeader, @@ -40,6 +68,49 @@ struct TimelineMetadataHeader { size: u16, // size of serialized metadata format_version: u16, // metadata format version (used for compatibility checks) } + +impl TryFrom<&TimelineMetadataBodyV2> for TimelineMetadataHeader { + type Error = Crc32CalculationFailed; + + fn try_from(value: &TimelineMetadataBodyV2) -> Result { + #[derive(Default)] + struct Crc32Sink { + crc: u32, + count: usize, + } + + impl std::io::Write for Crc32Sink { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.crc = crc32c::crc32c_append(self.crc, buf); + self.count += buf.len(); + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } + } + + // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works + // across serialization versions + let mut sink = Crc32Sink::default(); + ::ser_into(value, &mut sink) + .map_err(Crc32CalculationFailed)?; + + let size = METADATA_HDR_SIZE + sink.count; + + Ok(TimelineMetadataHeader { + checksum: sink.crc, + size: size as u16, + format_version: METADATA_FORMAT_VERSION, + }) + } +} + +#[derive(thiserror::Error, Debug)] +#[error("re-serializing for crc32 failed")] +struct Crc32CalculationFailed(#[source] utils::bin_ser::SerializeError); + const METADATA_HDR_SIZE: usize = std::mem::size_of::(); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -111,6 +182,12 @@ impl TimelineMetadata { } } + #[cfg(test)] + pub(crate) fn with_recalculated_checksum(mut self) -> anyhow::Result { + self.hdr = TimelineMetadataHeader::try_from(&self.body)?; + Ok(self) + } + fn upgrade_timeline_metadata(metadata_bytes: &[u8]) -> anyhow::Result { let mut hdr = TimelineMetadataHeader::des(&metadata_bytes[0..METADATA_HDR_SIZE])?; @@ -261,32 +338,8 @@ impl TimelineMetadata { } } -impl<'de> Deserialize<'de> for TimelineMetadata { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - let bytes = Vec::::deserialize(deserializer)?; - Self::from_bytes(bytes.as_slice()).map_err(D::Error::custom) - } -} - -impl Serialize for TimelineMetadata { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - let bytes = self.to_bytes().map_err(serde::ser::Error::custom)?; - bytes.serialize(serializer) - } -} - pub(crate) mod modern_serde { - use crate::tenant::metadata::METADATA_FORMAT_VERSION; - - use super::{ - TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader, METADATA_HDR_SIZE, - }; + use super::{TimelineMetadata, TimelineMetadataBodyV2, TimelineMetadataHeader}; use serde::{Deserialize, Serialize}; pub(crate) fn deserialize<'de, D>(deserializer: D) -> Result @@ -322,71 +375,15 @@ pub(crate) mod modern_serde { let de = serde::de::value::MapAccessDeserializer::new(map); let body = TimelineMetadataBodyV2::deserialize(de)?; + let hdr = TimelineMetadataHeader::try_from(&body).map_err(A::Error::custom)?; - // jump through hoops to calculate the crc32 so that TimelineMetadata::ne works - // across serialization versions - let mut sink = Crc32Sink::default(); - ::ser_into(&body, &mut sink) - .map_err(|e| A::Error::custom(Crc32CalculationFailed(e)))?; - - let size = METADATA_HDR_SIZE + sink.count; - - Ok(TimelineMetadata { - hdr: TimelineMetadataHeader { - checksum: sink.crc, - size: size as u16, - format_version: METADATA_FORMAT_VERSION, - }, - body, - }) + Ok(TimelineMetadata { hdr, body }) } } deserializer.deserialize_any(Visitor) } - #[derive(Default)] - struct Crc32Sink { - crc: u32, - count: usize, - } - - impl std::io::Write for Crc32Sink { - fn write(&mut self, buf: &[u8]) -> std::io::Result { - self.crc = crc32c::crc32c_append(self.crc, buf); - self.count += buf.len(); - Ok(buf.len()) - } - - fn flush(&mut self) -> std::io::Result<()> { - Ok(()) - } - } - - #[derive(thiserror::Error)] - #[error("re-serializing for crc32 failed")] - struct Crc32CalculationFailed(#[source] E); - - // this should be true for one release, after that we can change it to false - // remember to check the IndexPart::metadata field TODO comment as well - const LEGACY_BINCODED_BYTES: bool = true; - - #[derive(serde::Serialize)] - #[serde(transparent)] - struct LegacyPaddedBytes<'a>(&'a TimelineMetadata); - - struct JustTheBodyV2<'a>(&'a TimelineMetadata); - - impl serde::Serialize for JustTheBodyV2<'_> { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - // header is not needed, upon reading we've upgraded all v1 to v2 - self.0.body.serialize(serializer) - } - } - pub(crate) fn serialize( metadata: &TimelineMetadata, serializer: S, @@ -394,25 +391,23 @@ pub(crate) mod modern_serde { where S: serde::Serializer, { - // we cannot use TimelineMetadata::serialize for now because it'll do - // TimelineMetadata::to_bytes - if LEGACY_BINCODED_BYTES { - LegacyPaddedBytes(metadata).serialize(serializer) - } else { - JustTheBodyV2(metadata).serialize(serializer) - } + // header is not needed, upon reading we've upgraded all v1 to v2 + metadata.body.serialize(serializer) } #[test] fn deserializes_bytes_as_well_as_equivalent_body_v2() { #[derive(serde::Deserialize, serde::Serialize)] - struct Wrapper(#[serde(deserialize_with = "deserialize")] TimelineMetadata); + struct Wrapper( + #[serde(deserialize_with = "deserialize", serialize_with = "serialize")] + TimelineMetadata, + ); let too_many_bytes = "[216,111,252,208,0,54,0,4,0,0,0,0,1,73,253,144,1,0,0,0,0,1,73,253,24,0,0,0,0,0,0,0,0,0,0,0,0,0,1,73,253,24,0,0,0,0,1,73,253,24,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]"; let wrapper_from_bytes = serde_json::from_str::(too_many_bytes).unwrap(); - let serialized = serde_json::to_value(JustTheBodyV2(&wrapper_from_bytes.0)).unwrap(); + let serialized = serde_json::to_value(&wrapper_from_bytes).unwrap(); assert_eq!( serialized, @@ -553,59 +548,6 @@ mod tests { ); } - #[test] - fn test_metadata_bincode_serde() { - let original_metadata = TimelineMetadata::new( - Lsn(0x200), - Some(Lsn(0x100)), - Some(TIMELINE_ID), - Lsn(0), - Lsn(0), - Lsn(0), - // Any version will do here, so use the default - crate::DEFAULT_PG_VERSION, - ); - let metadata_bytes = original_metadata - .to_bytes() - .expect("Cannot create bytes array from metadata"); - - let metadata_bincode_be_bytes = original_metadata - .ser() - .expect("Cannot serialize the metadata"); - - // 8 bytes for the length of the vector - assert_eq!(metadata_bincode_be_bytes.len(), 8 + metadata_bytes.len()); - - let expected_bincode_bytes = { - let mut temp = vec![]; - let len_bytes = metadata_bytes.len().to_be_bytes(); - temp.extend_from_slice(&len_bytes); - temp.extend_from_slice(&metadata_bytes); - temp - }; - assert_eq!(metadata_bincode_be_bytes, expected_bincode_bytes); - - let deserialized_metadata = TimelineMetadata::des(&metadata_bincode_be_bytes).unwrap(); - // Deserialized metadata has the metadata header, which is different from the serialized one. - // Reference: TimelineMetaData::to_bytes() - let expected_metadata = { - let mut temp_metadata = original_metadata; - let body_bytes = temp_metadata - .body - .ser() - .expect("Cannot serialize the metadata body"); - let metadata_size = METADATA_HDR_SIZE + body_bytes.len(); - let hdr = TimelineMetadataHeader { - size: metadata_size as u16, - format_version: METADATA_FORMAT_VERSION, - checksum: crc32c::crc32c(&body_bytes), - }; - temp_metadata.hdr = hdr; - temp_metadata - }; - assert_eq!(deserialized_metadata, expected_metadata); - } - #[test] fn test_metadata_bincode_serde_ensure_roundtrip() { let original_metadata = TimelineMetadata::new( @@ -619,8 +561,6 @@ mod tests { crate::DEFAULT_PG_VERSION, ); let expected_bytes = vec![ - /* bincode length encoding bytes */ - 0, 0, 0, 0, 0, 0, 2, 0, // 8 bytes for the length of the serialized vector /* TimelineMetadataHeader */ 4, 37, 101, 34, 0, 70, 0, 4, // checksum, size, format_version (4 + 2 + 2) /* TimelineMetadataBodyV2 */ @@ -650,7 +590,7 @@ mod tests { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; - let metadata_ser_bytes = original_metadata.ser().unwrap(); + let metadata_ser_bytes = original_metadata.to_bytes().unwrap(); assert_eq!(metadata_ser_bytes, expected_bytes); let expected_metadata = { @@ -668,7 +608,7 @@ mod tests { temp_metadata.hdr = hdr; temp_metadata }; - let des_metadata = TimelineMetadata::des(&metadata_ser_bytes).unwrap(); + let des_metadata = TimelineMetadata::from_bytes(&metadata_ser_bytes).unwrap(); assert_eq!(des_metadata, expected_metadata); } } diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 7d2e9b9a91..6233a3477e 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -38,14 +38,17 @@ pub struct IndexPart { /// that latest version stores. pub layer_metadata: HashMap, - // 'disk_consistent_lsn' is a copy of the 'disk_consistent_lsn' in the metadata. - // It's duplicated for convenience when reading the serialized structure, but is - // private because internally we would read from metadata instead. + /// Because of the trouble of eyeballing the legacy "metadata" field, we copied the + /// "disk_consistent_lsn" out. After version 7 this is no longer needed, but the name cannot be + /// reused. pub(super) disk_consistent_lsn: Lsn, - // TODO: later make this "rename" to "alias", rename field as "legacy_metadata" + // TODO: rename as "metadata" next week, keep the alias = "metadata_bytes", bump version Adding + // the "alias = metadata" was forgotten in #7693, so we have to use "rewrite = metadata_bytes" + // for backwards compatibility. #[serde( rename = "metadata_bytes", + alias = "metadata", with = "crate::tenant::metadata::modern_serde" )] pub metadata: TimelineMetadata, @@ -76,10 +79,11 @@ impl IndexPart { /// - 4: timeline_layers is fully removed. /// - 5: lineage was added /// - 6: last_aux_file_policy is added. - const LATEST_VERSION: usize = 6; + /// - 7: metadata_bytes is no longer written, but still read + const LATEST_VERSION: usize = 7; // Versions we may see when reading from a bucket. - pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6]; + pub const KNOWN_VERSIONS: &'static [usize] = &[1, 2, 3, 4, 5, 6, 7]; pub const FILE_NAME: &'static str = "index_part.json"; @@ -95,7 +99,7 @@ impl IndexPart { } } - pub fn get_version(&self) -> usize { + pub fn version(&self) -> usize { self.version } @@ -217,9 +221,9 @@ impl Lineage { #[cfg(test)] mod tests { - use std::str::FromStr; - use super::*; + use std::str::FromStr; + use utils::id::TimelineId; #[test] fn v1_indexpart_is_parsed() { @@ -338,8 +342,7 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), lineage: Lineage::default(), last_aux_file_policy: None, }; @@ -515,8 +518,7 @@ mod tests { ]), disk_consistent_lsn: "0/16960E8".parse::().unwrap(), metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(), - deleted_at: Some(chrono::NaiveDateTime::parse_from_str( - "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), lineage: Lineage { reparenting_history_truncated: false, reparenting_history: vec![TimelineId::from_str("e1bfd8c633d713d279e6fcd2bcc15b6d").unwrap()], @@ -529,6 +531,60 @@ mod tests { assert_eq!(part, expected); } + #[test] + fn v7_indexpart_is_parsed() { + let example = r#"{ + "version": 7, + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata": { + "disk_consistent_lsn": "0/16960E8", + "prev_record_lsn": "0/1696070", + "ancestor_timeline": "e45a7f37d3ee2ff17dc14bf4f4e3f52e", + "ancestor_lsn": "0/0", + "latest_gc_cutoff_lsn": "0/1696070", + "initdb_lsn": "0/1696070", + "pg_version": 14 + }, + "deleted_at": "2023-07-31T09:00:00.123" + }"#; + + let expected = IndexPart { + version: 7, + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), LayerFileMetadata { + file_size: 25600000, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), LayerFileMetadata { + file_size: 9007199254741001, + generation: Generation::none(), + shard: ShardIndex::unsharded() + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata: TimelineMetadata::new( + Lsn::from_str("0/16960E8").unwrap(), + Some(Lsn::from_str("0/1696070").unwrap()), + Some(TimelineId::from_str("e45a7f37d3ee2ff17dc14bf4f4e3f52e").unwrap()), + Lsn::INVALID, + Lsn::from_str("0/1696070").unwrap(), + Lsn::from_str("0/1696070").unwrap(), + 14, + ).with_recalculated_checksum().unwrap(), + deleted_at: Some(parse_naive_datetime("2023-07-31T09:00:00.123000000")), + lineage: Default::default(), + last_aux_file_policy: Default::default(), + }; + + let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + assert_eq!(part, expected); + } + fn parse_naive_datetime(s: &str) -> NaiveDateTime { chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S.%f").unwrap() } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 999e2e8679..eb7cf81643 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -929,6 +929,45 @@ impl DeltaLayerInner { Ok(()) } + /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. + #[cfg(test)] + pub(super) async fn load_key_values( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let index_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( + self.index_start_blk, + self.index_root_blk, + block_reader, + ); + let mut result = Vec::new(); + let mut stream = + Box::pin(self.stream_index_forwards(&index_reader, &[0; DELTA_KEY_SIZE], ctx)); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let cursor = block_reader.block_cursor(); + let mut buf = Vec::new(); + while let Some(item) = stream.next().await { + let (key, lsn, pos) = item?; + // TODO: dedup code with get_reconstruct_value + // TODO: ctx handling and sharding + cursor + .read_blob_into_buf(pos.pos(), &mut buf, ctx) + .await + .with_context(|| { + format!("Failed to read blob from virtual file {}", self.file.path) + })?; + let val = Value::des(&buf).with_context(|| { + format!( + "Failed to deserialize file blob from virtual file {}", + self.file.path + ) + })?; + result.push((key, lsn, val)); + } + Ok(result) + } + async fn plan_reads( keyspace: &KeySpace, lsn_range: Range, diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 285618b146..06e2f09384 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -485,6 +485,34 @@ impl ImageLayerInner { Ok(()) } + /// Load all key-values in the delta layer, should be replaced by an iterator-based interface in the future. + #[cfg(test)] + pub(super) async fn load_key_values( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, &block_reader); + let mut result = Vec::new(); + let mut stream = Box::pin(tree_reader.get_stream_from(&[0; KEY_SIZE], ctx)); + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let cursor = block_reader.block_cursor(); + while let Some(item) = stream.next().await { + // TODO: dedup code with get_reconstruct_value + let (raw_key, offset) = item?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + // TODO: ctx handling and sharding + let blob = cursor + .read_blob(offset, ctx) + .await + .with_context(|| format!("failed to read value from offset {}", offset))?; + let value = Bytes::from(blob); + result.push((key, self.lsn, Value::Image(value))); + } + Ok(result) + } + /// Traverse the layer's index to build read operations on the overlap of the input keyspace /// and the keys in this layer. /// diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 9553f83026..1ecc56ce99 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -52,7 +52,7 @@ pub struct InMemoryLayer { /// Frozen layers have an exclusive end LSN. /// Writes are only allowed when this is `None`. - end_lsn: OnceLock, + pub(crate) end_lsn: OnceLock, /// Used for traversal path. Cached representation of the in-memory layer before frozen. local_path_str: Arc, diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 18f9ba4ef8..32acb3f0cd 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -388,6 +388,23 @@ impl Layer { }) } + /// Get all key/values in the layer. Should be replaced with an iterator-based API in the future. + #[cfg(test)] + pub(crate) async fn load_key_values( + &self, + ctx: &RequestContext, + ) -> anyhow::Result> { + let layer = self + .0 + .get_or_maybe_download(true, Some(ctx)) + .await + .map_err(|err| match err { + DownloadError::DownloadCancelled => GetVectoredError::Cancelled, + other => GetVectoredError::Other(anyhow::anyhow!(other)), + })?; + layer.load_key_values(&self.0, ctx).await + } + /// Download the layer if evicted. /// /// Will not error when the layer is already downloaded. @@ -1757,6 +1774,20 @@ impl DownloadedLayer { } } + #[cfg(test)] + async fn load_key_values( + &self, + owner: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result> { + use LayerKind::*; + + match self.get(owner, ctx).await? { + Delta(d) => d.load_key_values(ctx).await, + Image(i) => i.load_key_values(ctx).await, + } + } + async fn dump(&self, owner: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { use LayerKind::*; match self.get(owner, ctx).await? { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 388d5b9d54..54a4ceeaf3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -322,6 +322,8 @@ pub struct Timeline { /// Locked automatically by [`TimelineWriter`] and checkpointer. /// Must always be acquired before the layer map/individual layer lock /// to avoid deadlock. + /// + /// The state is cleared upon freezing. write_lock: tokio::sync::Mutex>, /// Used to avoid multiple `flush_loop` tasks running @@ -1569,7 +1571,15 @@ impl Timeline { // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { - let to_lsn = self.freeze_inmem_layer(false).await; + let to_lsn = { + // Freeze the current open in-memory layer. It will be written to disk on next + // iteration. + let mut g = self.write_lock.lock().await; + + let to_lsn = self.get_last_record_lsn(); + self.freeze_inmem_layer_at(to_lsn, &mut g).await; + to_lsn + }; self.flush_frozen_layers_and_wait(to_lsn).await } @@ -1578,7 +1588,7 @@ impl Timeline { // an ephemeral layer open forever when idle. It also freezes layers if the global limit on // ephemeral layer bytes has been breached. pub(super) async fn maybe_freeze_ephemeral_layer(&self) { - let Ok(_write_guard) = self.write_lock.try_lock() else { + let Ok(mut write_guard) = self.write_lock.try_lock() else { // If the write lock is held, there is an active wal receiver: rolling open layers // is their responsibility while they hold this lock. return; @@ -1655,24 +1665,35 @@ impl Timeline { self.last_freeze_at.load(), open_layer.get_opened_at(), ) { - match open_layer.info() { + let at_lsn = match open_layer.info() { InMemoryLayerInfo::Frozen { lsn_start, lsn_end } => { // We may reach this point if the layer was already frozen by not yet flushed: flushing // happens asynchronously in the background. tracing::debug!( "Not freezing open layer, it's already frozen ({lsn_start}..{lsn_end})" ); + None } InMemoryLayerInfo::Open { .. } => { // Upgrade to a write lock and freeze the layer drop(layers_guard); let mut layers_guard = self.layers.write().await; - layers_guard - .try_freeze_in_memory_layer(current_lsn, &self.last_freeze_at) + let froze = layers_guard + .try_freeze_in_memory_layer( + current_lsn, + &self.last_freeze_at, + &mut write_guard, + ) .await; + Some(current_lsn).filter(|_| froze) + } + }; + if let Some(lsn) = at_lsn { + let res: Result = self.flush_frozen_layers(lsn); + if let Err(e) = res { + tracing::info!("failed to flush frozen layer after background freeze: {e:#}"); } } - self.flush_frozen_layers(); } } @@ -2036,11 +2057,11 @@ impl Timeline { true } else if distance > 0 && opened_at.elapsed() >= self.get_checkpoint_timeout() { info!( - "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", - projected_lsn, - layer_size, - opened_at.elapsed() - ); + "Will roll layer at {} with layer size {} due to time since first write to the layer ({:?})", + projected_lsn, + layer_size, + opened_at.elapsed() + ); true } else { @@ -2381,7 +2402,7 @@ impl Timeline { let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); - assert!(matches!(*flush_loop_state, FlushLoopState::Running{ ..})); + assert!(matches!(*flush_loop_state, FlushLoopState::Running{..})); *flush_loop_state = FlushLoopState::Exited; Ok(()) } @@ -3644,28 +3665,21 @@ impl Timeline { self.last_record_lsn.advance(new_lsn); } - /// Whether there was a layer to freeze or not, return the value of get_last_record_lsn - /// before we attempted the freeze: this guarantees that ingested data is frozen up to this lsn (inclusive). - async fn freeze_inmem_layer(&self, write_lock_held: bool) -> Lsn { - // Freeze the current open in-memory layer. It will be written to disk on next - // iteration. - - let _write_guard = if write_lock_held { - None - } else { - Some(self.write_lock.lock().await) + async fn freeze_inmem_layer_at( + &self, + at: Lsn, + write_lock: &mut tokio::sync::MutexGuard<'_, Option>, + ) { + let frozen = { + let mut guard = self.layers.write().await; + guard + .try_freeze_in_memory_layer(at, &self.last_freeze_at, write_lock) + .await }; - - let to_lsn = self.get_last_record_lsn(); - self.freeze_inmem_layer_at(to_lsn).await; - to_lsn - } - - async fn freeze_inmem_layer_at(&self, at: Lsn) { - let mut guard = self.layers.write().await; - guard - .try_freeze_in_memory_layer(at, &self.last_freeze_at) - .await; + if frozen { + let now = Instant::now(); + *(self.last_freeze_ts.write().unwrap()) = now; + } } /// Layer flusher task's main loop. @@ -3759,18 +3773,14 @@ impl Timeline { } } - /// Request the flush loop to write out all frozen layers up to `to_lsn` as Delta L0 files to disk. - /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer`]. + /// Request the flush loop to write out all frozen layers up to `at_lsn` as Delta L0 files to disk. + /// The caller is responsible for the freezing, e.g., [`Self::freeze_inmem_layer_at`]. /// - /// `last_record_lsn` may be higher than the highest LSN of a frozen layer: if this is the case, - /// it means no data will be written between the top of the highest frozen layer and to_lsn, - /// e.g. because this tenant shard has ingested up to to_lsn and not written any data locally for that part of the WAL. - async fn flush_frozen_layers_and_wait( - &self, - last_record_lsn: Lsn, - ) -> Result<(), FlushLayerError> { - let mut rx = self.layer_flush_done_tx.subscribe(); - + /// `at_lsn` may be higher than the highest LSN of a frozen layer: if this is the + /// case, it means no data will be written between the top of the highest frozen layer and + /// to_lsn, e.g. because this tenant shard has ingested up to to_lsn and not written any data + /// locally for that part of the WAL. + fn flush_frozen_layers(&self, at_lsn: Lsn) -> Result { // Increment the flush cycle counter and wake up the flush task. // Remember the new value, so that when we listen for the flush // to finish, we know when the flush that we initiated has @@ -3785,13 +3795,18 @@ impl Timeline { self.layer_flush_start_tx.send_modify(|(counter, lsn)| { my_flush_request = *counter + 1; *counter = my_flush_request; - *lsn = std::cmp::max(last_record_lsn, *lsn); + *lsn = std::cmp::max(at_lsn, *lsn); }); + Ok(my_flush_request) + } + + async fn wait_flush_completion(&self, request: u64) -> Result<(), FlushLayerError> { + let mut rx = self.layer_flush_done_tx.subscribe(); loop { { let (last_result_counter, last_result) = &*rx.borrow(); - if *last_result_counter >= my_flush_request { + if *last_result_counter >= request { if let Err(err) = last_result { // We already logged the original error in // flush_loop. We cannot propagate it to the caller @@ -3818,12 +3833,9 @@ impl Timeline { } } - fn flush_frozen_layers(&self) { - self.layer_flush_start_tx.send_modify(|(counter, lsn)| { - *counter += 1; - - *lsn = std::cmp::max(*lsn, Lsn(self.last_freeze_at.load().0 - 1)); - }); + async fn flush_frozen_layers_and_wait(&self, at_lsn: Lsn) -> Result<(), FlushLayerError> { + let token = self.flush_frozen_layers(at_lsn)?; + self.wait_flush_completion(token).await } /// Flush one frozen in-memory layer to disk, as a new delta layer. @@ -5537,10 +5549,26 @@ impl Timeline { all_data.sort(); Ok(all_data) } + + /// Get all historic layer descriptors in the layer map + #[cfg(test)] + pub(crate) async fn inspect_historic_layers( + self: &Arc, + ) -> anyhow::Result> { + let mut layers = Vec::new(); + let guard = self.layers.read().await; + for layer in guard.layer_map().iter_historic_layers() { + layers.push(layer.key()); + } + Ok(layers) + } } type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId); +/// Tracking writes ingestion does to a particular in-memory layer. +/// +/// Cleared upon freezing a layer. struct TimelineWriterState { open_layer: Arc, current_size: u64, @@ -5581,12 +5609,6 @@ impl Deref for TimelineWriter<'_> { } } -impl Drop for TimelineWriter<'_> { - fn drop(&mut self) { - self.write_guard.take(); - } -} - #[derive(PartialEq)] enum OpenLayerAction { Roll, @@ -5669,16 +5691,15 @@ impl<'a> TimelineWriter<'a> { } async fn roll_layer(&mut self, freeze_at: Lsn) -> anyhow::Result<()> { - assert!(self.write_guard.is_some()); - - self.tl.freeze_inmem_layer_at(freeze_at).await; - - let now = Instant::now(); - *(self.last_freeze_ts.write().unwrap()) = now; - - self.tl.flush_frozen_layers(); - let current_size = self.write_guard.as_ref().unwrap().current_size; + + // self.write_guard will be taken by the freezing + self.tl + .freeze_inmem_layer_at(freeze_at, &mut self.write_guard) + .await; + + self.tl.flush_frozen_layers(freeze_at)?; + if current_size >= self.get_checkpoint_distance() * 2 { warn!("Flushed oversized open layer with size {}", current_size) } @@ -5692,9 +5713,27 @@ impl<'a> TimelineWriter<'a> { return OpenLayerAction::Open; }; + #[cfg(feature = "testing")] + if state.cached_last_freeze_at < self.tl.last_freeze_at.load() { + // this check and assertion are not really needed because + // LayerManager::try_freeze_in_memory_layer will always clear out the + // TimelineWriterState if something is frozen. however, we can advance last_freeze_at when there + // is no TimelineWriterState. + assert!( + state.open_layer.end_lsn.get().is_some(), + "our open_layer must be outdated" + ); + + // this would be a memory leak waiting to happen because the in-memory layer always has + // an index + panic!("BUG: TimelineWriterState held on to frozen in-memory layer."); + } + if state.prev_lsn == Some(lsn) { - // Rolling mid LSN is not supported by downstream code. + // Rolling mid LSN is not supported by [downstream code]. // Hence, only roll at LSN boundaries. + // + // [downstream code]: https://github.com/neondatabase/neon/pull/7993#discussion_r1633345422 return OpenLayerAction::None; } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index d8de6aee7c..8a95029f33 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -952,6 +952,178 @@ impl Timeline { adaptor.flush_updates().await?; Ok(()) } + + /// An experimental compaction building block that combines compaction with garbage collection. + /// + /// The current implementation picks all delta + image layers that are below or intersecting with + /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta + /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, + /// and create delta layers with all deltas >= gc horizon. + #[cfg(test)] + pub(crate) async fn compact_with_gc( + self: &Arc, + _cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result<(), CompactionError> { + use crate::tenant::storage_layer::ValueReconstructState; + // Step 0: pick all delta layers + image layers below/intersect with the GC horizon. + // The layer selection has the following properties: + // 1. If a layer is in the selection, all layers below it are in the selection. + // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. + let (layer_selection, gc_cutoff) = { + let guard = self.layers.read().await; + let layers = guard.layer_map(); + let gc_info = self.gc_info.read().unwrap(); + let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr); + let mut selected_layers = Vec::new(); + // TODO: consider retain_lsns + drop(gc_info); + for desc in layers.iter_historic_layers() { + if desc.get_lsn_range().start <= gc_cutoff { + selected_layers.push(guard.get_from_desc(&desc)); + } + } + (selected_layers, gc_cutoff) + }; + // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. + let mut all_key_values = Vec::new(); + for layer in &layer_selection { + all_key_values.extend(layer.load_key_values(ctx).await?); + } + // Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and + // image layers, make image appear later than delta. + struct ValueWrapper<'a>(&'a crate::repository::Value); + impl Ord for ValueWrapper<'_> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use crate::repository::Value; + use std::cmp::Ordering; + match (self.0, other.0) { + (Value::Image(_), Value::WalRecord(_)) => Ordering::Greater, + (Value::WalRecord(_), Value::Image(_)) => Ordering::Less, + _ => Ordering::Equal, + } + } + } + impl PartialOrd for ValueWrapper<'_> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + impl PartialEq for ValueWrapper<'_> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == std::cmp::Ordering::Equal + } + } + impl Eq for ValueWrapper<'_> {} + all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| { + (k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2))) + }); + let max_lsn = all_key_values + .iter() + .map(|(_, lsn, _)| lsn) + .max() + .copied() + .unwrap() + + 1; + // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. + // Data of the same key. + let mut accumulated_values = Vec::new(); + let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty + + /// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon. + async fn flush_accumulated_states( + tline: &Arc, + key: Key, + accumulated_values: &[&(Key, Lsn, crate::repository::Value)], + horizon: Lsn, + ) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> { + let mut base_image = None; + let mut keys_above_horizon = Vec::new(); + let mut delta_above_base_image = Vec::new(); + // We have a list of deltas/images. We want to create image layers while collect garbages. + for (key, lsn, val) in accumulated_values.iter().rev() { + if *lsn > horizon { + keys_above_horizon.push((*key, *lsn, val.clone())); // TODO: ensure one LSN corresponds to either delta or image instead of both + } else if *lsn <= horizon { + match val { + crate::repository::Value::Image(image) => { + if lsn <= &horizon { + base_image = Some((*lsn, image.clone())); + break; + } + } + crate::repository::Value::WalRecord(wal) => { + delta_above_base_image.push((*lsn, wal.clone())); + } + } + } + } + delta_above_base_image.reverse(); + keys_above_horizon.reverse(); + let state = ValueReconstructState { + img: base_image, + records: delta_above_base_image, + }; + let img = tline.reconstruct_value(key, horizon, state).await?; + Ok((keys_above_horizon, img)) + } + + let mut delta_layer_writer = DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + all_key_values.first().unwrap().0, + gc_cutoff..max_lsn, // TODO: off by one? + ctx, + ) + .await?; + let mut image_layer_writer = ImageLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + &(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()), + gc_cutoff, + ctx, + ) + .await?; + + for item @ (key, _, _) in &all_key_values { + if &last_key == key { + accumulated_values.push(item); + } else { + let (deltas, image) = + flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff) + .await?; + image_layer_writer.put_image(last_key, image, ctx).await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + accumulated_values.clear(); + accumulated_values.push(item); + last_key = *key; + } + } + let (deltas, image) = + flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?; + image_layer_writer.put_image(last_key, image, ctx).await?; + for (key, lsn, val) in deltas { + delta_layer_writer.put_value(key, lsn, val, ctx).await?; + } + accumulated_values.clear(); + // TODO: split layers + let delta_layer = delta_layer_writer.finish(last_key, self, ctx).await?; + let image_layer = image_layer_writer.finish(self, ctx).await?; + // Step 3: Place back to the layer map. + { + let mut guard = self.layers.write().await; + guard.finish_gc_compaction( + &layer_selection, + &[delta_layer.clone(), image_layer.clone()], + &self.metrics, + ) + }; + Ok(()) + } } struct TimelineAdaptor { diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 0e82dedecb..550a9a567a 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -21,6 +21,8 @@ use crate::{ }, }; +use super::TimelineWriterState; + /// Provides semantic APIs to manipulate the layer map. #[derive(Default)] pub(crate) struct LayerManager { @@ -120,18 +122,20 @@ impl LayerManager { Ok(layer) } - /// Called from `freeze_inmem_layer`, returns true if successfully frozen. - pub(crate) async fn try_freeze_in_memory_layer( + /// Tries to freeze an open layer and also manages clearing the TimelineWriterState. + /// + /// Returns true if anything was frozen. + pub(super) async fn try_freeze_in_memory_layer( &mut self, lsn: Lsn, last_freeze_at: &AtomicLsn, - ) { + write_lock: &mut tokio::sync::MutexGuard<'_, Option>, + ) -> bool { let Lsn(last_record_lsn) = lsn; let end_lsn = Lsn(last_record_lsn + 1); - if let Some(open_layer) = &self.layer_map.open_layer { + let froze = if let Some(open_layer) = &self.layer_map.open_layer { let open_layer_rc = Arc::clone(open_layer); - // Does this layer need freezing? open_layer.freeze(end_lsn).await; // The layer is no longer open, update the layer map to reflect this. @@ -139,11 +143,25 @@ impl LayerManager { self.layer_map.frozen_layers.push_back(open_layer_rc); self.layer_map.open_layer = None; self.layer_map.next_open_layer_at = Some(end_lsn); - } + + true + } else { + false + }; // Even if there was no layer to freeze, advance last_freeze_at to last_record_lsn+1: this // accounts for regions in the LSN range where we might have ingested no data due to sharding. last_freeze_at.store(end_lsn); + + // the writer state must no longer have a reference to the frozen layer + let taken = write_lock.take(); + assert_eq!( + froze, + taken.is_some(), + "should only had frozen a layer when TimelineWriterState existed" + ); + + froze } /// Add image layers to the layer map, called from `create_image_layers`. @@ -208,6 +226,18 @@ impl LayerManager { updates.flush(); } + /// Called when a GC-compaction is completed. + #[cfg(test)] + pub(crate) fn finish_gc_compaction( + &mut self, + compact_from: &[Layer], + compact_to: &[ResidentLayer], + metrics: &TimelineMetrics, + ) { + // We can simply reuse compact l0 logic. Use a different function name to indicate a different type of layer map modification. + self.finish_compact_l0(compact_from, compact_to, metrics) + } + /// Called when compaction is completed. pub(crate) fn rewrite_layers( &mut self, diff --git a/patches/pg_anon.patch b/patches/pg_anon.patch new file mode 100644 index 0000000000..15dfd3c5a0 --- /dev/null +++ b/patches/pg_anon.patch @@ -0,0 +1,223 @@ +commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f +Author: Alexey Masterov +Date: Fri May 31 06:34:26 2024 +0000 + + These alternative expected files were added to consider the neon features + +diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out +new file mode 100644 +index 0000000..2539cfd +--- /dev/null ++++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out +@@ -0,0 +1,101 @@ ++BEGIN; ++CREATE EXTENSION anon CASCADE; ++NOTICE: installing required extension "pgcrypto" ++SELECT anon.init(); ++ init ++------ ++ t ++(1 row) ++ ++CREATE ROLE mallory_the_masked_user; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED'; ++CREATE TABLE t1(i INT); ++ALTER TABLE t1 ADD COLUMN t TEXT; ++SECURITY LABEL FOR anon ON COLUMN t1.t ++IS 'MASKED WITH VALUE NULL'; ++INSERT INTO t1 VALUES (1,'test'); ++-- ++-- We're checking the owner's permissions ++-- ++-- see ++-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions ++-- ++SET ROLE mallory_the_masked_user; ++SELECT anon.pseudo_first_name(0) IS NOT NULL; ++ ?column? ++---------- ++ t ++(1 row) ++ ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ PERFORM anon.init(); ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ PERFORM anon.anonymize_table('t1'); ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++-- SHOULD FAIL ++SAVEPOINT fail_start_engine; ++SELECT anon.start_dynamic_masking(); ++ERROR: Only supersusers can start the dynamic masking engine. ++CONTEXT: PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE ++ROLLBACK TO fail_start_engine; ++RESET ROLE; ++SELECT anon.start_dynamic_masking(); ++ start_dynamic_masking ++----------------------- ++ t ++(1 row) ++ ++SET ROLE mallory_the_masked_user; ++SELECT * FROM mask.t1; ++ i | t ++---+--- ++ 1 | ++(1 row) ++ ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ SELECT * FROM public.t1; ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++-- SHOULD FAIL ++SAVEPOINT fail_stop_engine; ++SELECT anon.stop_dynamic_masking(); ++ERROR: Only supersusers can stop the dynamic masking engine. ++CONTEXT: PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE ++ROLLBACK TO fail_stop_engine; ++RESET ROLE; ++SELECT anon.stop_dynamic_masking(); ++NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually. ++ stop_dynamic_masking ++---------------------- ++ t ++(1 row) ++ ++SET ROLE mallory_the_masked_user; ++SELECT COUNT(*)=1 FROM anon.pg_masking_rules; ++ ?column? ++---------- ++ t ++(1 row) ++ ++-- SHOULD FAIL ++SAVEPOINT fail_seclabel_on_role; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL; ++ERROR: permission denied ++DETAIL: The current user must have the CREATEROLE attribute. ++ROLLBACK TO fail_seclabel_on_role; ++ROLLBACK; +diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out +new file mode 100644 +index 0000000..8b090fe +--- /dev/null ++++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out +@@ -0,0 +1,104 @@ ++BEGIN; ++CREATE EXTENSION anon CASCADE; ++NOTICE: installing required extension "pgcrypto" ++SELECT anon.init(); ++ init ++------ ++ t ++(1 row) ++ ++CREATE ROLE oscar_the_owner; ++ALTER DATABASE :DBNAME OWNER TO oscar_the_owner; ++CREATE ROLE mallory_the_masked_user; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED'; ++-- ++-- We're checking the owner's permissions ++-- ++-- see ++-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions ++-- ++SET ROLE oscar_the_owner; ++SELECT anon.pseudo_first_name(0) IS NOT NULL; ++ ?column? ++---------- ++ t ++(1 row) ++ ++-- SHOULD FAIL ++DO $$ ++BEGIN ++ PERFORM anon.init(); ++ EXCEPTION WHEN insufficient_privilege ++ THEN RAISE NOTICE 'insufficient_privilege'; ++END$$; ++NOTICE: insufficient_privilege ++CREATE TABLE t1(i INT); ++ALTER TABLE t1 ADD COLUMN t TEXT; ++SECURITY LABEL FOR anon ON COLUMN t1.t ++IS 'MASKED WITH VALUE NULL'; ++INSERT INTO t1 VALUES (1,'test'); ++SELECT anon.anonymize_table('t1'); ++ anonymize_table ++----------------- ++ t ++(1 row) ++ ++SELECT * FROM t1; ++ i | t ++---+--- ++ 1 | ++(1 row) ++ ++UPDATE t1 SET t='test' WHERE i=1; ++-- SHOULD FAIL ++SAVEPOINT fail_start_engine; ++SELECT anon.start_dynamic_masking(); ++ start_dynamic_masking ++----------------------- ++ t ++(1 row) ++ ++ROLLBACK TO fail_start_engine; ++RESET ROLE; ++SELECT anon.start_dynamic_masking(); ++ start_dynamic_masking ++----------------------- ++ t ++(1 row) ++ ++SET ROLE oscar_the_owner; ++SELECT * FROM t1; ++ i | t ++---+------ ++ 1 | test ++(1 row) ++ ++--SELECT * FROM mask.t1; ++-- SHOULD FAIL ++SAVEPOINT fail_stop_engine; ++SELECT anon.stop_dynamic_masking(); ++ERROR: permission denied for schema mask ++CONTEXT: SQL statement "DROP VIEW mask.t1;" ++PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE ++SQL statement "SELECT anon.mask_drop_view(oid) ++ FROM pg_catalog.pg_class ++ WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE ++ AND relkind IN ('r','p','f')" ++PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM ++ROLLBACK TO fail_stop_engine; ++RESET ROLE; ++SELECT anon.stop_dynamic_masking(); ++NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually. ++ stop_dynamic_masking ++---------------------- ++ t ++(1 row) ++ ++SET ROLE oscar_the_owner; ++-- SHOULD FAIL ++SAVEPOINT fail_seclabel_on_role; ++SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL; ++ERROR: permission denied ++DETAIL: The current user must have the CREATEROLE attribute. ++ROLLBACK TO fail_seclabel_on_role; ++ROLLBACK; diff --git a/patches/pg_cron.patch b/patches/pg_cron.patch new file mode 100644 index 0000000000..c2b648c20c --- /dev/null +++ b/patches/pg_cron.patch @@ -0,0 +1,19 @@ +commit b3ea51ee158f113f2f82d0b97c12c54343c9a695 (HEAD -> master) +Author: Alexey Masterov +Date: Fri Jun 7 19:23:42 2024 +0000 + + Disable REGRESS_OPTIONS causing initdb + +diff --git a/ext-src/pg_cron-src/Makefile b/ext-src/pg_cron-src/Makefile +index 053314c..fbd5fb5 100644 +--- a/ext-src/pg_cron-src/Makefile ++++ b/ext-src/pg_cron-src/Makefile +@@ -5,7 +5,7 @@ EXTENSION = pg_cron + DATA_built = $(EXTENSION)--1.0.sql + DATA = $(wildcard $(EXTENSION)--*--*.sql) + +-REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check ++#REGRESS_OPTS =--temp-config=./pg_cron.conf --temp-instance=./tmp_check + REGRESS = pg_cron-test + + # compilation configuration diff --git a/patches/pg_hintplan.patch b/patches/pg_hintplan.patch new file mode 100644 index 0000000000..61a5ecbb90 --- /dev/null +++ b/patches/pg_hintplan.patch @@ -0,0 +1,39 @@ +commit f7925d4d1406c0f0229e3c691c94b69e381899b1 (HEAD -> master) +Author: Alexey Masterov +Date: Thu Jun 6 08:02:42 2024 +0000 + + Patch expected files to consider Neon's log messages + +diff --git a/ext-src/pg_hint_plan-src/expected/ut-A.out b/ext-src/pg_hint_plan-src/expected/ut-A.out +index da723b8..f8d0102 100644 +--- a/ext-src/pg_hint_plan-src/expected/ut-A.out ++++ b/ext-src/pg_hint_plan-src/expected/ut-A.out +@@ -9,13 +9,16 @@ SET search_path TO public; + ---- + -- No.A-1-1-3 + CREATE EXTENSION pg_hint_plan; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan + -- No.A-1-2-3 + DROP EXTENSION pg_hint_plan; + -- No.A-1-1-4 + CREATE SCHEMA other_schema; + CREATE EXTENSION pg_hint_plan SCHEMA other_schema; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan + ERROR: extension "pg_hint_plan" must be installed in schema "hint_plan" + CREATE EXTENSION pg_hint_plan; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/pg_hint_plan + DROP SCHEMA other_schema; + ---- + ---- No. A-5-1 comment pattern +diff --git a/ext-src/pg_hint_plan-src/expected/ut-fdw.out b/ext-src/pg_hint_plan-src/expected/ut-fdw.out +index d372459..6282afe 100644 +--- a/ext-src/pg_hint_plan-src/expected/ut-fdw.out ++++ b/ext-src/pg_hint_plan-src/expected/ut-fdw.out +@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on; + SET client_min_messages TO LOG; + SET pg_hint_plan.enable_hint TO on; + CREATE EXTENSION file_fdw; ++LOG: Sending request to compute_ctl: http://localhost:3080/extension_server/file_fdw + CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw; + CREATE USER MAPPING FOR PUBLIC SERVER file_server; + CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename'); diff --git a/pgxn/.dir-locals.el b/pgxn/.dir-locals.el new file mode 100644 index 0000000000..ab6208b698 --- /dev/null +++ b/pgxn/.dir-locals.el @@ -0,0 +1,19 @@ +;; see also src/tools/editors/emacs.samples for more complete settings + +((c-mode . ((c-basic-offset . 4) + (c-file-style . "bsd") + (fill-column . 78) + (indent-tabs-mode . t) + (tab-width . 4))) + (nxml-mode . ((fill-column . 78) + (indent-tabs-mode . nil))) + (perl-mode . ((perl-indent-level . 4) + (perl-continued-statement-offset . 2) + (perl-continued-brace-offset . -2) + (perl-brace-offset . 0) + (perl-brace-imaginary-offset . 0) + (perl-label-offset . -2) + (indent-tabs-mode . t) + (tab-width . 4))) + (sgml-mode . ((fill-column . 78) + (indent-tabs-mode . nil)))) diff --git a/pgxn/.editorconfig b/pgxn/.editorconfig new file mode 100644 index 0000000000..d69a3d1dc4 --- /dev/null +++ b/pgxn/.editorconfig @@ -0,0 +1,14 @@ +root = true + +[*.{c,h,l,y,pl,pm}] +indent_style = tab +indent_size = tab +tab_width = 4 + +[*.{sgml,xml}] +indent_style = space +indent_size = 1 + +[*.xsl] +indent_style = space +indent_size = 2 diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index b69a3819c9..276d1542fe 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -19,6 +19,7 @@ #include "catalog/pg_type.h" #include "postmaster/bgworker.h" #include "postmaster/interrupt.h" +#include "replication/logical.h" #include "replication/slot.h" #include "replication/walsender.h" #include "storage/procsignal.h" @@ -280,6 +281,7 @@ _PG_init(void) pg_init_libpagestore(); pg_init_walproposer(); WalSender_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index 93dce9de84..8f8d1dfc01 100644 --- a/pgxn/neon/walsender_hooks.c +++ b/pgxn/neon/walsender_hooks.c @@ -24,8 +24,12 @@ #include "walproposer.h" static NeonWALReader *wal_reader = NULL; + +struct WalSnd; +extern struct WalSnd *MyWalSnd; extern XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); extern bool GetDonorShmem(XLogRecPtr *donor_lsn); +extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); static XLogRecPtr NeonWALReadWaitForWAL(XLogRecPtr loc) @@ -36,7 +40,28 @@ NeonWALReadWaitForWAL(XLogRecPtr loc) CHECK_FOR_INTERRUPTS(); } - return WalSndWaitForWal(loc); + // Walsender sends keepalives and stuff, so better use its normal wait + if (MyWalSnd != NULL) + return WalSndWaitForWal(loc); + + for (;;) + { + XLogRecPtr flush_ptr; + if (!RecoveryInProgress()) +#if PG_VERSION_NUM >= 150000 + flush_ptr = GetFlushRecPtr(NULL); +#else + flush_ptr = GetFlushRecPtr(); +#endif + else + flush_ptr = GetXLogReplayRecPtr(NULL); + + if (loc <= flush_ptr) + return flush_ptr; + + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000); + } } static int diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs index 3ff0d87988..4ab85509dc 100644 --- a/storage_controller/src/scheduler.rs +++ b/storage_controller/src/scheduler.rs @@ -29,6 +29,8 @@ pub enum MaySchedule { struct SchedulerNode { /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`]. shard_count: usize, + /// How many shards are currently attached on this node, via their [`crate::tenant_shard::IntentState`]. + attached_shard_count: usize, /// Whether this node is currently elegible to have new shards scheduled (this is derived /// from a node's availability state and scheduling policy). @@ -42,7 +44,9 @@ impl PartialEq for SchedulerNode { (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No) ); - may_schedule_matches && self.shard_count == other.shard_count + may_schedule_matches + && self.shard_count == other.shard_count + && self.attached_shard_count == other.attached_shard_count } } @@ -138,6 +142,15 @@ impl ScheduleContext { } } +pub(crate) enum RefCountUpdate { + PromoteSecondary, + Attach, + Detach, + DemoteAttached, + AddSecondary, + RemoveSecondary, +} + impl Scheduler { pub(crate) fn new<'a>(nodes: impl Iterator) -> Self { let mut scheduler_nodes = HashMap::new(); @@ -146,6 +159,7 @@ impl Scheduler { node.get_id(), SchedulerNode { shard_count: 0, + attached_shard_count: 0, may_schedule: node.may_schedule(), }, ); @@ -171,6 +185,7 @@ impl Scheduler { node.get_id(), SchedulerNode { shard_count: 0, + attached_shard_count: 0, may_schedule: node.may_schedule(), }, ); @@ -179,7 +194,10 @@ impl Scheduler { for shard in shards { if let Some(node_id) = shard.intent.get_attached() { match expect_nodes.get_mut(node_id) { - Some(node) => node.shard_count += 1, + Some(node) => { + node.shard_count += 1; + node.attached_shard_count += 1; + } None => anyhow::bail!( "Tenant {} references nonexistent node {}", shard.tenant_shard_id, @@ -227,31 +245,42 @@ impl Scheduler { Ok(()) } - /// Increment the reference count of a node. This reference count is used to guide scheduling - /// decisions, not for memory management: it represents one tenant shard whose IntentState targets - /// this node. + /// Update the reference counts of a node. These reference counts are used to guide scheduling + /// decisions, not for memory management: they represent the number of tenant shard whose IntentState + /// targets this node and the number of tenants shars whose IntentState is attached to this + /// node. /// /// It is an error to call this for a node that is not known to the scheduler (i.e. passed into /// [`Self::new`] or [`Self::node_upsert`]) - pub(crate) fn node_inc_ref(&mut self, node_id: NodeId) { - let Some(node) = self.nodes.get_mut(&node_id) else { - tracing::error!("Scheduler missing node {node_id}"); - debug_assert!(false); - return; - }; - - node.shard_count += 1; - } - - /// Decrement a node's reference count. Inverse of [`Self::node_inc_ref`]. - pub(crate) fn node_dec_ref(&mut self, node_id: NodeId) { + pub(crate) fn update_node_ref_counts(&mut self, node_id: NodeId, update: RefCountUpdate) { let Some(node) = self.nodes.get_mut(&node_id) else { debug_assert!(false); tracing::error!("Scheduler missing node {node_id}"); return; }; - node.shard_count -= 1; + match update { + RefCountUpdate::PromoteSecondary => { + node.attached_shard_count += 1; + } + RefCountUpdate::Attach => { + node.shard_count += 1; + node.attached_shard_count += 1; + } + RefCountUpdate::Detach => { + node.shard_count -= 1; + node.attached_shard_count -= 1; + } + RefCountUpdate::DemoteAttached => { + node.attached_shard_count -= 1; + } + RefCountUpdate::AddSecondary => { + node.shard_count += 1; + } + RefCountUpdate::RemoveSecondary => { + node.shard_count -= 1; + } + } } pub(crate) fn node_upsert(&mut self, node: &Node) { @@ -263,6 +292,7 @@ impl Scheduler { Vacant(entry) => { entry.insert(SchedulerNode { shard_count: 0, + attached_shard_count: 0, may_schedule: node.may_schedule(), }); } @@ -385,6 +415,11 @@ impl Scheduler { pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize { self.nodes.get(&node_id).unwrap().shard_count } + + #[cfg(test)] + pub(crate) fn get_node_attached_shard_count(&self, node_id: NodeId) -> usize { + self.nodes.get(&node_id).unwrap().attached_shard_count + } } #[cfg(test)] @@ -437,18 +472,28 @@ mod tests { let scheduled = scheduler.schedule_shard(&[], &context)?; t2_intent.set_attached(&mut scheduler, Some(scheduled)); - assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); - assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?; t1_intent.push_secondary(&mut scheduler, scheduled); - assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1); - assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 2); + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 1); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); t1_intent.clear(&mut scheduler); - assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0); - assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1); + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 1); + + let total_attached = scheduler.get_node_attached_shard_count(NodeId(1)) + + scheduler.get_node_attached_shard_count(NodeId(2)); + assert_eq!(total_attached, 1); if cfg!(debug_assertions) { // Dropping an IntentState without clearing it causes a panic in debug mode, @@ -459,8 +504,12 @@ mod tests { assert!(result.is_err()); } else { t2_intent.clear(&mut scheduler); - assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 0); - assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 0); + + assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 0); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 0); + + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 0); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0); } Ok(()) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 756dc10a2a..1e81b5c5a2 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4312,7 +4312,7 @@ impl Service { continue; } - if tenant_shard.intent.demote_attached(node_id) { + if tenant_shard.intent.demote_attached(scheduler, node_id) { tenant_shard.sequence = tenant_shard.sequence.next(); // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index dda17f9887..77bbf4c604 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -8,7 +8,7 @@ use crate::{ metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, persistence::TenantShardPersistence, reconciler::ReconcileUnits, - scheduler::{AffinityScore, MaySchedule, ScheduleContext}, + scheduler::{AffinityScore, MaySchedule, RefCountUpdate, ScheduleContext}, }; use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy}; use pageserver_api::{ @@ -153,7 +153,7 @@ impl IntentState { } pub(crate) fn single(scheduler: &mut Scheduler, node_id: Option) -> Self { if let Some(node_id) = node_id { - scheduler.node_inc_ref(node_id); + scheduler.update_node_ref_counts(node_id, RefCountUpdate::Attach); } Self { attached: node_id, @@ -164,10 +164,10 @@ impl IntentState { pub(crate) fn set_attached(&mut self, scheduler: &mut Scheduler, new_attached: Option) { if self.attached != new_attached { if let Some(old_attached) = self.attached.take() { - scheduler.node_dec_ref(old_attached); + scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); } if let Some(new_attached) = &new_attached { - scheduler.node_inc_ref(*new_attached); + scheduler.update_node_ref_counts(*new_attached, RefCountUpdate::Attach); } self.attached = new_attached; } @@ -177,22 +177,27 @@ impl IntentState { /// secondary to attached while maintaining the scheduler's reference counts. pub(crate) fn promote_attached( &mut self, - _scheduler: &mut Scheduler, + scheduler: &mut Scheduler, promote_secondary: NodeId, ) { // If we call this with a node that isn't in secondary, it would cause incorrect // scheduler reference counting, since we assume the node is already referenced as a secondary. debug_assert!(self.secondary.contains(&promote_secondary)); - // TODO: when scheduler starts tracking attached + secondary counts separately, we will - // need to call into it here. self.secondary.retain(|n| n != &promote_secondary); + + let demoted = self.attached; self.attached = Some(promote_secondary); + + scheduler.update_node_ref_counts(promote_secondary, RefCountUpdate::PromoteSecondary); + if let Some(demoted) = demoted { + scheduler.update_node_ref_counts(demoted, RefCountUpdate::DemoteAttached); + } } pub(crate) fn push_secondary(&mut self, scheduler: &mut Scheduler, new_secondary: NodeId) { debug_assert!(!self.secondary.contains(&new_secondary)); - scheduler.node_inc_ref(new_secondary); + scheduler.update_node_ref_counts(new_secondary, RefCountUpdate::AddSecondary); self.secondary.push(new_secondary); } @@ -200,27 +205,27 @@ impl IntentState { pub(crate) fn remove_secondary(&mut self, scheduler: &mut Scheduler, node_id: NodeId) { let index = self.secondary.iter().position(|n| *n == node_id); if let Some(index) = index { - scheduler.node_dec_ref(node_id); + scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); self.secondary.remove(index); } } pub(crate) fn clear_secondary(&mut self, scheduler: &mut Scheduler) { for secondary in self.secondary.drain(..) { - scheduler.node_dec_ref(secondary); + scheduler.update_node_ref_counts(secondary, RefCountUpdate::RemoveSecondary); } } /// Remove the last secondary node from the list of secondaries pub(crate) fn pop_secondary(&mut self, scheduler: &mut Scheduler) { if let Some(node_id) = self.secondary.pop() { - scheduler.node_dec_ref(node_id); + scheduler.update_node_ref_counts(node_id, RefCountUpdate::RemoveSecondary); } } pub(crate) fn clear(&mut self, scheduler: &mut Scheduler) { if let Some(old_attached) = self.attached.take() { - scheduler.node_dec_ref(old_attached); + scheduler.update_node_ref_counts(old_attached, RefCountUpdate::Detach); } self.clear_secondary(scheduler); @@ -251,12 +256,11 @@ impl IntentState { /// forget the location on the offline node. /// /// Returns true if a change was made - pub(crate) fn demote_attached(&mut self, node_id: NodeId) -> bool { + pub(crate) fn demote_attached(&mut self, scheduler: &mut Scheduler, node_id: NodeId) -> bool { if self.attached == Some(node_id) { - // TODO: when scheduler starts tracking attached + secondary counts separately, we will - // need to call into it here. self.attached = None; self.secondary.push(node_id); + scheduler.update_node_ref_counts(node_id, RefCountUpdate::DemoteAttached); true } else { false @@ -593,7 +597,7 @@ impl TenantShard { Secondary => { if let Some(node_id) = self.intent.get_attached() { // Populate secondary by demoting the attached node - self.intent.demote_attached(*node_id); + self.intent.demote_attached(scheduler, *node_id); modified = true; } else if self.intent.secondary.is_empty() { // Populate secondary by scheduling a fresh node @@ -783,7 +787,7 @@ impl TenantShard { old_attached_node_id, new_attached_node_id, }) => { - self.intent.demote_attached(old_attached_node_id); + self.intent.demote_attached(scheduler, old_attached_node_id); self.intent .promote_attached(scheduler, new_attached_node_id); } @@ -1321,7 +1325,9 @@ pub(crate) mod tests { assert_ne!(attached_node_id, secondary_node_id); // Notifying the attached node is offline should demote it to a secondary - let changed = tenant_shard.intent.demote_attached(attached_node_id); + let changed = tenant_shard + .intent + .demote_attached(&mut scheduler, attached_node_id); assert!(changed); assert!(tenant_shard.intent.attached.is_none()); assert_eq!(tenant_shard.intent.secondary.len(), 2); @@ -1604,7 +1610,14 @@ pub(crate) mod tests { // We should see equal number of locations on the two nodes. assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4); + // Scheduling does not consider the number of attachments picking the initial + // pageserver to attach to (hence the assertion that all primaries are on the + // same node) + // TODO: Tweak the scheduling to evenly distribute attachments for new shards. + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 4); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 0); // Add another two nodes: we should see the shards spread out when their optimize // methods are called @@ -1613,9 +1626,16 @@ pub(crate) mod tests { optimize_til_idle(&nodes, &mut scheduler, &mut shards); assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(1)), 1); + assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(2)), 1); + assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(3)), 1); + assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2); + assert_eq!(scheduler.get_node_attached_shard_count(NodeId(4)), 1); for shard in shards.iter_mut() { shard.intent.clear(&mut scheduler); diff --git a/s3_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml similarity index 98% rename from s3_scrubber/Cargo.toml rename to storage_scrubber/Cargo.toml index 48b50ca21c..050be66483 100644 --- a/s3_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "s3_scrubber" +name = "storage_scrubber" version = "0.1.0" edition.workspace = true license.workspace = true diff --git a/s3_scrubber/README.md b/storage_scrubber/README.md similarity index 99% rename from s3_scrubber/README.md rename to storage_scrubber/README.md index 8a96542ada..0930f343ec 100644 --- a/s3_scrubber/README.md +++ b/storage_scrubber/README.md @@ -1,4 +1,4 @@ -# Neon S3 scrubber +# Neon Storage Scrubber This tool directly accesses the S3 buckets used by the Neon `pageserver` and `safekeeper`, and does housekeeping such as cleaning up objects for tenants & timelines that no longer exist. diff --git a/s3_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs similarity index 97% rename from s3_scrubber/src/checks.rs rename to storage_scrubber/src/checks.rs index 44fb53696c..4eb8580e32 100644 --- a/s3_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -78,17 +78,16 @@ pub(crate) fn branch_cleanup_and_check_errors( index_part_generation: _index_part_generation, s3_layers: _s3_layers, } => { - if !IndexPart::KNOWN_VERSIONS.contains(&index_part.get_version()) { - result.errors.push(format!( - "index_part.json version: {}", - index_part.get_version() - )) + if !IndexPart::KNOWN_VERSIONS.contains(&index_part.version()) { + result + .errors + .push(format!("index_part.json version: {}", index_part.version())) } - if &index_part.get_version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { + if &index_part.version() != IndexPart::KNOWN_VERSIONS.last().unwrap() { result.warnings.push(format!( "index_part.json version is not latest: {}", - index_part.get_version() + index_part.version() )) } diff --git a/s3_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs similarity index 100% rename from s3_scrubber/src/cloud_admin_api.rs rename to storage_scrubber/src/cloud_admin_api.rs diff --git a/s3_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs similarity index 100% rename from s3_scrubber/src/garbage.rs rename to storage_scrubber/src/garbage.rs diff --git a/s3_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs similarity index 100% rename from s3_scrubber/src/lib.rs rename to storage_scrubber/src/lib.rs diff --git a/s3_scrubber/src/main.rs b/storage_scrubber/src/main.rs similarity index 96% rename from s3_scrubber/src/main.rs rename to storage_scrubber/src/main.rs index ade8ef7d7a..222bd10ed2 100644 --- a/s3_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,11 +1,11 @@ use anyhow::bail; use camino::Utf8PathBuf; use pageserver_api::shard::TenantShardId; -use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; -use s3_scrubber::pageserver_physical_gc::GcMode; -use s3_scrubber::scan_pageserver_metadata::scan_metadata; -use s3_scrubber::tenant_snapshot::SnapshotDownloader; -use s3_scrubber::{ +use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; +use storage_scrubber::pageserver_physical_gc::GcMode; +use storage_scrubber::scan_pageserver_metadata::scan_metadata; +use storage_scrubber::tenant_snapshot::SnapshotDownloader; +use storage_scrubber::{ init_logging, pageserver_physical_gc::pageserver_physical_gc, scan_safekeeper_metadata::scan_safekeeper_metadata, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth, diff --git a/s3_scrubber/src/metadata_stream.rs b/storage_scrubber/src/metadata_stream.rs similarity index 100% rename from s3_scrubber/src/metadata_stream.rs rename to storage_scrubber/src/metadata_stream.rs diff --git a/s3_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs similarity index 100% rename from s3_scrubber/src/pageserver_physical_gc.rs rename to storage_scrubber/src/pageserver_physical_gc.rs diff --git a/s3_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs similarity index 99% rename from s3_scrubber/src/scan_pageserver_metadata.rs rename to storage_scrubber/src/scan_pageserver_metadata.rs index 6ff9783875..af74ffa4cd 100644 --- a/s3_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -125,7 +125,7 @@ impl MetadataSummary { { *self .indices_by_version - .entry(index_part.get_version()) + .entry(index_part.version()) .or_insert(0) += 1; if let Err(e) = self.update_histograms(index_part) { diff --git a/s3_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs similarity index 100% rename from s3_scrubber/src/scan_safekeeper_metadata.rs rename to storage_scrubber/src/scan_safekeeper_metadata.rs diff --git a/s3_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs similarity index 100% rename from s3_scrubber/src/tenant_snapshot.rs rename to storage_scrubber/src/tenant_snapshot.rs diff --git a/test_runner/README.md b/test_runner/README.md index fd68cfff79..7d95634ea8 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -285,6 +285,21 @@ def test_foobar(neon_env_builder: NeonEnvBuilder): ... ``` +The env includes a default tenant and timeline. Therefore, you do not need to create your own +tenant/timeline for testing. + +```python +def test_foobar2(neon_env_builder: NeonEnvBuilder): + env = neon_env_builder.init_start() # Start the environment + with env.endpoints.create_start("main") as endpoint: + # Start the compute endpoint + client = env.pageserver.http_client() # Get the pageserver client + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id) +``` + For more information about pytest fixtures, see https://docs.pytest.org/en/stable/fixture.html At the end of a test, all the nodes in the environment are automatically stopped, so you diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a70f4228e4..469937c558 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -833,7 +833,7 @@ class NeonEnvBuilder: def enable_scrub_on_exit(self): """ Call this if you would like the fixture to automatically run - s3_scrubber at the end of the test, as a bidirectional test + storage_scrubber at the end of the test, as a bidirectional test that the scrubber is working properly, and that the code within the test didn't produce any invalid remote state. """ @@ -948,7 +948,7 @@ class NeonEnvBuilder: if self.scrub_on_exit: try: - S3Scrubber(self).scan_metadata() + StorageScrubber(self).scan_metadata() except Exception as e: log.error(f"Error during remote storage scrub: {e}") cleanup_error = e @@ -3391,7 +3391,7 @@ def static_proxy( yield proxy -class Endpoint(PgProtocol): +class Endpoint(PgProtocol, LogUtils): """An object representing a Postgres compute endpoint managed by the control plane.""" def __init__( @@ -3457,6 +3457,7 @@ class Endpoint(PgProtocol): ) path = Path("endpoints") / self.endpoint_id / "pgdata" self.pgdata_dir = os.path.join(self.env.repo_dir, path) + self.logfile = self.endpoint_path() / "compute.log" config_lines = config_lines or [] @@ -3941,7 +3942,7 @@ class Safekeeper(LogUtils): wait_until(20, 0.5, paused) -class S3Scrubber: +class StorageScrubber: def __init__(self, env: NeonEnvBuilder, log_dir: Optional[Path] = None): self.env = env self.log_dir = log_dir or env.test_output_dir @@ -3961,7 +3962,7 @@ class S3Scrubber: if s3_storage.endpoint is not None: env.update({"AWS_ENDPOINT_URL": s3_storage.endpoint}) - base_args = [str(self.env.neon_binpath / "s3_scrubber")] + base_args = [str(self.env.neon_binpath / "storage_scrubber")] args = base_args + args (output_path, stdout, status_code) = subprocess_capture( diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 91435e8a1f..72384c138b 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -313,7 +313,7 @@ def assert_prefix_empty( # https://neon-github-public-dev.s3.amazonaws.com/reports/pr-5322/6207777020/index.html#suites/3556ed71f2d69272a7014df6dcb02317/53b5c368b5a68865 # this seems like a mock_s3 issue log.warning( - f"contrading ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0" + f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}, assuming this means KeyCount=0" ) keys = 0 elif keys != 0 and len(objects) == 0: diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index d06d2f3d93..7f54eb0b0a 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -591,3 +591,20 @@ class PropagatingThread(threading.Thread): if self.exc: raise self.exc return self.ret + + +def human_bytes(amt: float) -> str: + """ + Render a bytes amount into nice IEC bytes string. + """ + + suffixes = ["", "Ki", "Mi", "Gi"] + + last = suffixes[-1] + + for name in suffixes: + if amt < 1024 or name == last: + return f"{int(round(amt))} {name}B" + amt = amt / 1024 + + raise RuntimeError("unreachable") diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 7ae2352c06..7722828c79 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -17,7 +17,7 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty from fixtures.remote_storage import RemoteStorageKind -from fixtures.utils import wait_until +from fixtures.utils import human_bytes, wait_until GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" @@ -218,19 +218,6 @@ def count_layers_per_tenant( return dict(ret) -def human_bytes(amt: float) -> str: - suffixes = ["", "Ki", "Mi", "Gi"] - - last = suffixes[-1] - - for name in suffixes: - if amt < 1024 or name == last: - return f"{int(round(amt))} {name}B" - amt = amt / 1024 - - raise RuntimeError("unreachable") - - def _eviction_env( request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, num_pageservers: int ) -> EvictionEnv: @@ -294,7 +281,7 @@ def pgbench_init_tenant( "gc_period": "0s", "compaction_period": "0s", "checkpoint_distance": f"{layer_size}", - "image_creation_threshold": "100", + "image_creation_threshold": "999999", "compaction_target_size": f"{layer_size}", } ) @@ -668,11 +655,10 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or finish_tenant_creation(env, tenant_id, timeline_id, min_expected_layers) tenant_layers = count_layers_per_tenant(env.pageserver, map(lambda x: x[0], timelines)) - (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, False) + (total_on_disk, _, _) = poor_mans_du(env, map(lambda x: x[0], timelines), env.pageserver, True) - # cut 10 percent response = env.pageserver.http_client().disk_usage_eviction_run( - {"evict_bytes": total_on_disk // 10, "eviction_order": order.config()} + {"evict_bytes": total_on_disk // 5, "eviction_order": order.config()} ) log.info(f"{response}") diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index cf7a1c56ee..1d1b2fb485 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,6 +1,5 @@ import asyncio import os -import re import threading import time from functools import partial @@ -18,20 +17,6 @@ from fixtures.neon_fixtures import ( from fixtures.utils import wait_until -# Check for corrupted WAL messages which might otherwise go unnoticed if -# reconnection fixes this. -def scan_standby_log_for_errors(secondary): - log_path = secondary.endpoint_path() / "compute.log" - with log_path.open("r") as f: - markers = re.compile( - r"incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr" - ) - for line in f: - if markers.search(line): - log.info(f"bad error in standby log: {line}") - raise AssertionError() - - def test_hot_standby(neon_simple_env: NeonEnv): env = neon_simple_env @@ -91,7 +76,11 @@ def test_hot_standby(neon_simple_env: NeonEnv): assert response is not None assert response == responses[query] - scan_standby_log_for_errors(secondary) + # Check for corrupted WAL messages which might otherwise go unnoticed if + # reconnection fixes this. + assert not secondary.log_contains( + "incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr" + ) # clean up if slow_down_send: diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py new file mode 100644 index 0000000000..44c77b3410 --- /dev/null +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -0,0 +1,151 @@ +from dataclasses import dataclass +from typing import Iterable, List, Union + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn +from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo +from fixtures.utils import human_bytes + + +def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, build_type: str): + """ + Build a non-small GIN index which includes similarly batched up images in WAL stream as does pgvector + to show that we no longer create oversized layers. + """ + + if build_type == "debug": + pytest.skip("debug run is unnecessarily slow") + + minimum_initdb_size = 20 * 1024**2 + checkpoint_distance = 32 * 1024**2 + minimum_good_layer_size = checkpoint_distance * 0.9 + minimum_too_large_layer_size = 2 * checkpoint_distance + + # index size: 99MiB + rows = 2_500_000 + + # bucket lower limits + buckets = [0, minimum_initdb_size, minimum_good_layer_size, minimum_too_large_layer_size] + + assert ( + minimum_initdb_size < minimum_good_layer_size + ), "keep checkpoint_distance higher than the initdb size (find it by experimenting)" + + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_distance": f"{checkpoint_distance}", + "compaction_target_size": f"{checkpoint_distance}", + # this test is primarly interested in L0 sizes but we'll compact after ingestion to ensure sizes are good even then + "compaction_period": "0s", + "gc_period": "0s", + "compaction_threshold": "255", + "image_creation_threshold": "99999", + } + ) + + # build a larger than 3*checkpoint_distance sized gin index. + # gin index building exhibits the same behaviour as the pgvector with the two phase build + with env.endpoints.create_start("main") as ep, ep.cursor() as cur: + cur.execute( + f"create table int_array_test as select array_agg(g) as int_array from generate_series(1, {rows}) g group by g / 10;" + ) + cur.execute( + "create index int_array_test_gin_index on int_array_test using gin (int_array);" + ) + cur.execute("select pg_table_size('int_array_test_gin_index')") + size = cur.fetchone() + assert size is not None + assert isinstance(size[0], int) + log.info(f"gin index size: {human_bytes(size[0])}") + assert ( + size[0] > checkpoint_distance * 3 + ), f"gin index is not large enough: {human_bytes(size[0])}" + wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + + ps_http = env.pageserver.http_client() + ps_http.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + + infos = ps_http.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(infos.in_memory_layers) == 0, "should had flushed open layers" + post_ingest = histogram_historic_layers(infos, buckets) + + # describe first, assert later for easier debugging + log.info("non-cumulative layer size distribution after ingestion:") + print_layer_size_histogram(post_ingest) + + # since all we have are L0s, we should be getting nice L1s and images out of them now + ps_http.patch_tenant_config_client_side( + env.initial_tenant, + { + "compaction_threshold": 1, + "image_creation_threshold": 1, + }, + ) + + ps_http.timeline_compact(env.initial_tenant, env.initial_timeline, True, True) + + infos = ps_http.layer_map_info(env.initial_tenant, env.initial_timeline) + assert len(infos.in_memory_layers) == 0, "no new inmem layers expected" + post_compact = histogram_historic_layers(infos, buckets) + + log.info("non-cumulative layer size distribution after compaction:") + print_layer_size_histogram(post_compact) + + assert ( + post_ingest.counts[3] == 0 + ), f"there should be no layers larger than 2*checkpoint_distance ({human_bytes(2*checkpoint_distance)})" + assert post_ingest.counts[1] == 1, "expect one smaller layer for initdb" + assert ( + post_ingest.counts[0] <= 1 + ), "expect at most one tiny layer from shutting down the endpoint" + + # just make sure we don't have trouble splitting the layers apart + assert post_compact.counts[3] == 0 + + +@dataclass +class Histogram: + buckets: List[Union[int, float]] + counts: List[int] + sums: List[int] + + +def histogram_historic_layers( + infos: LayerMapInfo, minimum_sizes: List[Union[int, float]] +) -> Histogram: + def log_layer(layer: HistoricLayerInfo) -> HistoricLayerInfo: + log.info( + f"{layer.layer_file_name} {human_bytes(layer.layer_file_size)} ({layer.layer_file_size} bytes)" + ) + return layer + + layers = map(log_layer, infos.historic_layers) + sizes = (x.layer_file_size for x in layers) + return histogram(sizes, minimum_sizes) + + +def histogram(sizes: Iterable[int], minimum_sizes: List[Union[int, float]]) -> Histogram: + assert all(minimum_sizes[i] < minimum_sizes[i + 1] for i in range(len(minimum_sizes) - 1)) + buckets = list(enumerate(minimum_sizes)) + counts = [0 for _ in buckets] + sums = [0 for _ in buckets] + + for size in sizes: + found = False + for index, min_size in reversed(buckets): + if size >= min_size: + counts[index] += 1 + sums[index] += size + found = True + break + assert found + + return Histogram(minimum_sizes, counts, sums) + + +def print_layer_size_histogram(h: Histogram): + for index, min_size in enumerate(h.buckets): + log.info( + f">= {human_bytes(min_size)}: {h.counts[index]} layers total {human_bytes(h.sums[index])}" + ) diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 18e5111786..54d3b2d515 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -37,7 +37,8 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): """ neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start() + env = neon_env_builder.init_configs() + env.start() env.pageserver.allowed_errors.extend( [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] ) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index a657d5a035..ca3c81d6e5 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -221,6 +221,35 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) +def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + env.neon_cli.create_branch("init") + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute("create table wal_generator (id serial primary key, data text)") + cur.execute( + "SELECT * FROM pg_create_logical_replication_slot('slotty_mcslotface', 'test_decoding')" + ) + cur.execute( + """ +INSERT INTO wal_generator (data) +SELECT repeat('A', 1024) -- Generates a kilobyte of data per row +FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of data +""" + ) + + endpoint.stop_and_destroy() + endpoint = env.endpoints.create_start("init") + + with endpoint.connect().cursor() as cur: + cur.execute( + "SELECT * FROM pg_logical_slot_peek_binary_changes('slotty_mcslotface', NULL, NULL, 'include-xids', '0')" + ) + + # Tests that walsender correctly blocks until WAL is downloaded from safekeepers def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): neon_env_builder.num_safekeepers = 3 @@ -247,6 +276,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of connstr = endpoint.connstr().replace("'", "''") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub") logical_replication_sync(vanilla_pg, endpoint) + vanilla_pg.stop() # Pause the safekeepers so that they can't send WAL (except to pageserver) diff --git a/test_runner/regress/test_migrations.py b/test_runner/regress/test_migrations.py index 526ae14b87..5637f160cf 100644 --- a/test_runner/regress/test_migrations.py +++ b/test_runner/regress/test_migrations.py @@ -8,8 +8,6 @@ def test_migrations(neon_simple_env: NeonEnv): env.neon_cli.create_branch("test_migrations", "empty") endpoint = env.endpoints.create("test_migrations") - log_path = endpoint.endpoint_path() / "compute.log" - endpoint.respec(skip_pg_catalog_updates=False) endpoint.start() @@ -22,9 +20,7 @@ def test_migrations(neon_simple_env: NeonEnv): migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - with open(log_path, "r") as log_file: - logs = log_file.read() - assert f"INFO handle_migrations: Ran {num_migrations} migrations" in logs + endpoint.assert_log_contains(f"INFO handle_migrations: Ran {num_migrations} migrations") endpoint.stop() endpoint.start() @@ -36,6 +32,4 @@ def test_migrations(neon_simple_env: NeonEnv): migration_id = cur.fetchall() assert migration_id[0][0] == num_migrations - with open(log_path, "r") as log_file: - logs = log_file.read() - assert "INFO handle_migrations: Ran 0 migrations" in logs + endpoint.assert_log_contains("INFO handle_migrations: Ran 0 migrations") diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 0235cf6d20..696af24e5c 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -22,7 +22,7 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, PgBin, - S3Scrubber, + StorageScrubber, generate_uploads_and_deletions, ) from fixtures.pageserver.common_types import parse_layer_file_name @@ -215,7 +215,7 @@ def test_generations_upgrade(neon_env_builder: NeonEnvBuilder): # Having written a mixture of generation-aware and legacy index_part.json, # ensure the scrubber handles the situation as expected. - metadata_summary = S3Scrubber(neon_env_builder).scan_metadata() + metadata_summary = StorageScrubber(neon_env_builder).scan_metadata() assert metadata_summary["tenant_count"] == 1 # Scrubber should have seen our timeline assert metadata_summary["timeline_count"] == 1 assert metadata_summary["timeline_shard_count"] == 1 diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 5bfa9cce8c..2782d33e15 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -7,7 +7,7 @@ from typing import Any, Dict, Optional import pytest from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, S3Scrubber +from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver, StorageScrubber from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import ( assert_prefix_empty, @@ -214,7 +214,7 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): # Having done a bunch of attach/detach cycles, we will have generated some index garbage: check # that the scrubber sees it and cleans it up. We do this before the final attach+validate pass, # to also validate that the scrubber isn't breaking anything. - gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] > 0 @@ -536,7 +536,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): # Scrub the remote storage # ======================== # This confirms that the scrubber isn't upset by the presence of the heatmap - S3Scrubber(neon_env_builder).scan_metadata() + StorageScrubber(neon_env_builder).scan_metadata() # Detach secondary and delete tenant # =================================== @@ -563,6 +563,7 @@ def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): ) ), ) + workload.stop() def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index 7fdabaaec7..6383d24c57 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -2,6 +2,7 @@ import time from datetime import datetime, timezone from fixtures.common_types import Lsn +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, @@ -32,7 +33,12 @@ def test_tenant_s3_restore( assert remote_storage, "remote storage not configured" enable_remote_storage_versioning(remote_storage) - env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) + # change it back after initdb, recovery doesn't work if the two + # index_part.json uploads happen at same second or too close to each other. + initial_tenant_conf = MANY_SMALL_LAYERS_TENANT_CONFIG + del initial_tenant_conf["checkpoint_distance"] + + env = neon_env_builder.init_start(initial_tenant_conf) env.pageserver.allowed_errors.extend( [ # The deletion queue will complain when it encounters simulated S3 errors @@ -43,14 +49,16 @@ def test_tenant_s3_restore( ) ps_http = env.pageserver.http_client() - tenant_id = env.initial_tenant + # now lets create the small layers + ps_http.set_tenant_config(tenant_id, MANY_SMALL_LAYERS_TENANT_CONFIG) + # Default tenant and the one we created assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 # create two timelines one being the parent of another, both with non-trivial data - parent = None + parent = "main" last_flush_lsns = [] for timeline in ["first", "second"]: @@ -64,6 +72,7 @@ def test_tenant_s3_restore( last_flush_lsns.append(last_flush_lsn) ps_http.timeline_checkpoint(tenant_id, timeline_id) wait_for_upload(ps_http, tenant_id, timeline_id, last_flush_lsn) + log.info(f"{timeline} timeline {timeline_id} {last_flush_lsn=}") parent = timeline # These sleeps are important because they fend off differences in clocks between us and S3 @@ -108,6 +117,9 @@ def test_tenant_s3_restore( ps_http.tenant_attach(tenant_id, generation=generation) env.pageserver.quiesce_tenants() + for tline in ps_http.timeline_list(env.initial_tenant): + log.info(f"timeline detail: {tline}") + for i, timeline in enumerate(["first", "second"]): with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: endpoint.safe_psql(f"SELECT * FROM created_{timeline};") diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 390cbd242a..ae1ce3eb67 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -11,8 +11,8 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, - S3Scrubber, StorageControllerApiException, + StorageScrubber, last_flush_lsn_upload, tenant_get_shards, wait_for_last_flush_lsn, @@ -126,7 +126,7 @@ def test_sharding_smoke( # Check the scrubber isn't confused by sharded content, then disable # it during teardown because we'll have deleted by then - S3Scrubber(neon_env_builder).scan_metadata() + StorageScrubber(neon_env_builder).scan_metadata() neon_env_builder.scrub_on_exit = False env.storage_controller.pageserver_api().tenant_delete(tenant_id) @@ -693,6 +693,9 @@ def test_sharding_ingest_layer_sizes( # small checkpointing and compaction targets to ensure we generate many upload operations "checkpoint_distance": f"{expect_layer_size}", "compaction_target_size": f"{expect_layer_size}", + # aim to reduce flakyness, we are not doing explicit checkpointing + "compaction_period": "0s", + "gc_period": "0s", } shard_count = 4 neon_env_builder.num_pageservers = shard_count @@ -708,6 +711,23 @@ def test_sharding_ingest_layer_sizes( tenant_id = env.initial_tenant timeline_id = env.initial_timeline + # ignore the initdb layer(s) for the purposes of the size comparison as a initdb image layer optimization + # will produce a lot more smaller layers. + initial_layers_per_shard = {} + log.info("initdb distribution (not asserted on):") + for shard in env.storage_controller.locate(tenant_id): + pageserver = env.get_pageserver(shard["node_id"]) + shard_id = shard["shard_id"] + layers = ( + env.get_pageserver(shard["node_id"]).http_client().layer_map_info(shard_id, timeline_id) + ) + for layer in layers.historic_layers: + log.info( + f"layer[{pageserver.id}]: {layer.layer_file_name} (size {layer.layer_file_size})" + ) + + initial_layers_per_shard[shard_id] = set(layers.historic_layers) + workload = Workload(env, tenant_id, timeline_id) workload.init() workload.write_rows(4096, upload=False) @@ -729,7 +749,13 @@ def test_sharding_ingest_layer_sizes( historic_layers = sorted(layer_map.historic_layers, key=lambda layer: layer.lsn_start) + initial_layers = initial_layers_per_shard[shard_id] + for layer in historic_layers: + if layer in initial_layers: + # ignore the initdb image layers for the size histogram + continue + if layer.layer_file_size < expect_layer_size // 2: classification = "Small" small_layer_count += 1 @@ -759,7 +785,8 @@ def test_sharding_ingest_layer_sizes( pass else: # General case: - assert float(small_layer_count) / float(ok_layer_count) < 0.25 + # old limit was 0.25 but pg14 is right at the limit with 7/28 + assert float(small_layer_count) / float(ok_layer_count) < 0.3 # Each shard may emit up to one huge layer, because initdb ingest doesn't respect checkpoint_distance. assert huge_layer_count <= shard_count diff --git a/test_runner/regress/test_s3_scrubber.py b/test_runner/regress/test_storage_scrubber.py similarity index 94% rename from test_runner/regress/test_s3_scrubber.py rename to test_runner/regress/test_storage_scrubber.py index 6baba190f3..35ae61c380 100644 --- a/test_runner/regress/test_s3_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -6,7 +6,7 @@ import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.neon_fixtures import ( NeonEnvBuilder, - S3Scrubber, + StorageScrubber, ) from fixtures.remote_storage import S3Storage, s3_storage from fixtures.workload import Workload @@ -60,7 +60,7 @@ def test_scrubber_tenant_snapshot(neon_env_builder: NeonEnvBuilder, shard_count: output_path = neon_env_builder.test_output_dir / "snapshot" os.makedirs(output_path) - scrubber = S3Scrubber(neon_env_builder) + scrubber = StorageScrubber(neon_env_builder) scrubber.tenant_snapshot(tenant_id, output_path) assert len(os.listdir(output_path)) > 0 @@ -143,18 +143,18 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt workload.write_rows(1) # With a high min_age, the scrubber should decline to delete anything - gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600) + gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=3600) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # If targeting a different tenant, the scrubber shouldn't do anything - gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc( + gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc( min_age_secs=1, tenant_ids=[TenantId.generate()] ) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == 0 # With a low min_age, the scrubber should go ahead and clean up all but the latest 2 generations - gc_summary = S3Scrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) + gc_summary = StorageScrubber(neon_env_builder).pageserver_physical_gc(min_age_secs=1) assert gc_summary["remote_storage_errors"] == 0 assert gc_summary["indices_deleted"] == (expect_indices_per_shard - 2) * shard_count diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index fa7cead1bd..fd3cc45c3f 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -10,7 +10,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, - S3Scrubber, + StorageScrubber, last_flush_lsn_upload, wait_for_last_flush_lsn, ) @@ -707,7 +707,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, neon_env_builder: NeonEnvBuilder) remote_storage_kind = RemoteStorageKind.MOCK_S3 neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) - scrubber = S3Scrubber(neon_env_builder) + scrubber = StorageScrubber(neon_env_builder) env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index 89aa855352..6c762f3871 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -1,7 +1,9 @@ import time +from contextlib import closing from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, fork_at_current_lsn +from fixtures.utils import query_scalar # @@ -113,11 +115,90 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): assert cur_new.fetchall() == [] -# -# Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK -# record. -# -def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): +def test_vm_bit_clear_on_heap_lock_whitebox(neon_env_builder: NeonEnvBuilder): + """ + Test that the ALL_FROZEN VM bit is cleared correctly at a HEAP_LOCK record. + + This is a repro for the bug fixed in commit 66fa176cc8. + """ + # Debugging https://github.com/neondatabase/neon/issues/6967 + neon_env_builder.preserve_database_files = True + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + # If auto-analyze runs at the same time that we run VACUUM FREEZE, it + # can hold a snasphot that prevent the tuples from being frozen. + "autovacuum=off", + "log_checkpoints=on", + ], + ) + + # Run the tests in a dedicated database, because the activity monitor + # periodically runs some queries on to the 'postgres' database. If that + # happens at the same time that we're trying to freeze, the activity + # monitor's queries can hold back the xmin horizon and prevent freezing. + with closing(endpoint.connect()) as pg_conn: + pg_conn.cursor().execute("CREATE DATABASE vmbitsdb") + pg_conn = endpoint.connect(dbname="vmbitsdb") + cur = pg_conn.cursor() + + # Install extension containing function needed for test + cur.execute("CREATE EXTENSION neon_test_utils") + cur.execute("CREATE EXTENSION pageinspect") + + # Create a test table and freeze it to set the all-frozen VM bit on all pages. + cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)") + cur.execute("BEGIN") + cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g") + xid = int(query_scalar(cur, "SELECT txid_current()")) + cur.execute("COMMIT") + cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true, VERBOSE) vmtest_lock") + for notice in pg_conn.notices: + log.info(f"{notice}") + + # This test has been flaky in the past, because background activity like + # auto-analyze and compute_ctl's activity monitor queries have prevented the + # tuples from being frozen. Check that they were frozen. + relfrozenxid = int( + query_scalar(cur, "SELECT relfrozenxid FROM pg_class WHERE relname='vmtest_lock'") + ) + assert ( + relfrozenxid > xid + ), f"Inserted rows were not frozen. This can be caused by concurrent activity in the database. (XID {xid}, relfrozenxid {relfrozenxid}" + + # Lock a row. This clears the all-frozen VM bit for that page. + cur.execute("BEGIN") + cur.execute("SELECT * FROM vmtest_lock WHERE id = 40000 FOR UPDATE") + cur.execute("COMMIT") + + # The VM page in shared buffer cache, and the same page as reconstructed by + # the pageserver, should be equal. Except for the LSN: Clearing a bit in the + # VM doesn't bump the LSN in PostgreSQL, but the pageserver updates the LSN + # when it replays the VM-bit clearing record (since commit 387a36874c) + # + # This is a bit fragile, we've had lot of flakiness in this test before. For + # example, because all the VM bits were not set because concurrent + # autoanalyze prevented the VACUUM FREEZE from freezing the tuples. Or + # because autoavacuum kicked in and re-froze the page between the + # get_raw_page() and get_raw_page_at_lsn() calls. We disable autovacuum now, + # which should make this deterministic. + cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") + vm_page_in_cache = (cur.fetchall()[0][0])[8:100].hex() + cur.execute( + "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" + ) + vm_page_at_pageserver = (cur.fetchall()[0][0])[8:100].hex() + + assert vm_page_at_pageserver == vm_page_in_cache + + +def test_vm_bit_clear_on_heap_lock_blackbox(neon_env_builder: NeonEnvBuilder): + """ + The previous test is enough to verify the bug that was fixed in + commit 66fa176cc8. But for good measure, we also reproduce the + original problem that the missing VM page update caused. + """ # Debugging https://github.com/neondatabase/neon/issues/6967 neon_env_builder.preserve_database_files = True @@ -133,9 +214,9 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf) tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_vm_bit_clear_on_heap_lock") + timeline_id = env.initial_timeline endpoint = env.endpoints.create_start( - "test_vm_bit_clear_on_heap_lock", + "main", config_lines=[ "log_autovacuum_min_duration = 0", # Perform anti-wraparound vacuuming aggressively @@ -149,12 +230,10 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): # Install extension containing function needed for test cur.execute("CREATE EXTENSION neon_test_utils") - cur.execute("CREATE EXTENSION pageinspect") # Create a test table and freeze it to set the all-frozen VM bit on all pages. cur.execute("CREATE TABLE vmtest_lock (id integer PRIMARY KEY)") cur.execute("INSERT INTO vmtest_lock SELECT g FROM generate_series(1, 50000) g") - cur.execute("VACUUM (FREEZE, DISABLE_PAGE_SKIPPING true) vmtest_lock") # Lock a row. This clears the all-frozen VM bit for that page. @@ -168,27 +247,6 @@ def test_vm_bit_clear_on_heap_lock(neon_env_builder: NeonEnvBuilder): cur.execute("COMMIT") - # The VM page in shared buffer cache, and the same page as reconstructed - # by the pageserver, should be equal. - # - # Ignore page header (24 bytes) of visibility map. - # If the dirty VM page is flushed from the cache for some reason, - # it gets WAL-logged, which changes the LSN on the page. - # Also in neon SMGR we can replace empty heap page with zero (uninitialized) heap page. - cur.execute("select get_raw_page( 'vmtest_lock', 'vm', 0 )") - vm_page_in_cache = (cur.fetchall()[0][0])[24:100].hex() - cur.execute( - "select get_raw_page_at_lsn( 'vmtest_lock', 'vm', 0, pg_current_wal_insert_lsn(), NULL )" - ) - vm_page_at_pageserver = (cur.fetchall()[0][0])[24:100].hex() - - assert vm_page_at_pageserver == vm_page_in_cache - - # The above assert is enough to verify the bug that was fixed in - # commit 66fa176cc8. But for good measure, we also reproduce the - # original problem that the missing VM page update caused. The - # rest of the test does that. - # Kill and restart postgres, to clear the buffer cache. # # NOTE: clear_buffer_cache() will not do, because it evicts the dirty pages diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 17e0f5ff4e..4c51945a61 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 17e0f5ff4e1905691aa40e1e08f9b79b14c99652 +Subproject commit 4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index c2c3d40534..e22098d86d 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit c2c3d40534db97d83dd7e185d1971e707fa2f445 +Subproject commit e22098d86d6c40276b6bd75c29133a33fb283ab6 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index b228f20372..9837db1578 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit b228f20372ebcabfd7946647cb7adbd38bacb14a +Subproject commit 9837db157837fcf43ef7348be0017d3a2238cd27 diff --git a/vendor/revisions.json b/vendor/revisions.json index 5bf4e289ef..f945ea6d73 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "b228f20372ebcabfd7946647cb7adbd38bacb14a"], - "v15": ["15.7", "c2c3d40534db97d83dd7e185d1971e707fa2f445"], - "v14": ["14.12", "17e0f5ff4e1905691aa40e1e08f9b79b14c99652"] + "v16": ["16.3", "9837db157837fcf43ef7348be0017d3a2238cd27"], + "v15": ["15.7", "e22098d86d6c40276b6bd75c29133a33fb283ab6"], + "v14": ["14.12", "4c51945a6167ca06c0169e7a4ca5a8e7ffa3faba"] }