Compare commits

..

2 Commits

Author SHA1 Message Date
Erik Grinaker
a81df96db5 safekeeper: flush WAL on transaction commit 2024-11-11 17:48:32 +01:00
Erik Grinaker
71f04f91c2 safekeeper: send AppendResponse on segment flush 2024-11-11 11:24:43 +01:00
62 changed files with 531 additions and 1370 deletions

View File

@@ -20,4 +20,3 @@ config-variables:
- REMOTE_STORAGE_AZURE_REGION
- SLACK_UPCOMING_RELEASE_CHANNEL_ID
- DEV_AWS_OIDC_ROLE_ARN
- BENCHMARK_INGEST_TARGET_PROJECTID

View File

@@ -221,8 +221,6 @@ runs:
REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const { REPORT_URL, COMMIT_SHA } = process.env

View File

@@ -1,3 +1,14 @@
## Problem
## Summary of changes
## Checklist before requesting a review
- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
## Checklist before merging
- [ ] Do not forget to reformat commit message to not include the above checklist

View File

@@ -133,38 +133,38 @@ jobs:
- name: Check vendor/postgres-v14 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: hlinnaka/submodule-branch-check-action@main
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v14"
fetch_depth: "50"
sub_fetch_depth: ""
sub_fetch_depth: "50"
pass_if_unchanged: true
- name: Check vendor/postgres-v15 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: hlinnaka/submodule-branch-check-action@main
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v15"
fetch_depth: "50"
sub_fetch_depth: ""
sub_fetch_depth: "50"
pass_if_unchanged: true
- name: Check vendor/postgres-v16 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: hlinnaka/submodule-branch-check-action@main
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v16"
fetch_depth: "50"
sub_fetch_depth: ""
sub_fetch_depth: "50"
pass_if_unchanged: true
- name: Check vendor/postgres-v17 submodule reference
if: steps.check-if-submodules-changed.outputs.vendor == 'true'
uses: hlinnaka/submodule-branch-check-action@main
uses: jtmullen/submodule-branch-check-action@v1
with:
path: "vendor/postgres-v17"
fetch_depth: "50"
sub_fetch_depth: ""
sub_fetch_depth: "50"
pass_if_unchanged: true
check-codestyle-rust:
@@ -497,8 +497,6 @@ jobs:
REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const { REPORT_URL_NEW, COMMIT_SHA } = process.env

View File

@@ -1,372 +0,0 @@
name: Benchmarking
on:
# uncomment to run on push for debugging your PR
# push:
# branches: [ your branch ]
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '0 9 * * *' # run once a day, timezone is utc
workflow_dispatch: # adds ability to run this manually
defaults:
run:
shell: bash -euxo pipefail {0}
concurrency:
# Allow only one workflow globally because we need dedicated resources which only exist once
group: ingest-bench-workflow
cancel-in-progress: true
jobs:
ingest:
strategy:
matrix:
target_project: [new_empty_project, large_existing_project]
permissions:
contents: write
statuses: write
id-token: write # aws-actions/configure-aws-credentials
env:
PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config
PSQL: /tmp/neon/pg_install/v16/bin/psql
PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib
PGCOPYDB: /pgcopydb/bin/pgcopydb
PGCOPYDB_LIB_PATH: /pgcopydb/lib
runs-on: [ self-hosted, us-east-2, x64 ]
container:
image: neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
timeout-minutes: 1440
steps:
- uses: actions/checkout@v4
- name: Configure AWS credentials # necessary to download artefacts
uses: aws-actions/configure-aws-credentials@v4
with:
aws-region: eu-central-1
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
- name: Create Neon Project
if: ${{ matrix.target_project == 'new_empty_project' }}
id: create-neon-project-ingest-target
uses: ./.github/actions/neon-project-create
with:
region_id: aws-us-east-2
postgres_version: 16
compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Initialize Neon project and retrieve current backpressure seconds
if: ${{ matrix.target_project == 'new_empty_project' }}
env:
NEW_PROJECT_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }}
NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
run: |
echo "Initializing Neon project with project_id: ${NEW_PROJECT_ID}"
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV
echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV
- name: Create Neon Branch for large tenant
if: ${{ matrix.target_project == 'large_existing_project' }}
id: create-neon-branch-ingest-target
uses: ./.github/actions/neon-branch-create
with:
project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Initialize Neon project and retrieve current backpressure seconds
if: ${{ matrix.target_project == 'large_existing_project' }}
env:
NEW_PROJECT_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }}
NEW_BRANCH_ID: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }}
run: |
echo "Initializing Neon branch with branch_id: ${NEW_BRANCH_ID}"
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
# Extract the part before the database name
base_connstr="${NEW_PROJECT_CONNSTR%/*}"
# Extract the query parameters (if any) after the database name
query_params="${NEW_PROJECT_CONNSTR#*\?}"
# Reconstruct the new connection string
if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then
new_connstr="${base_connstr}/neondb?${query_params}"
else
new_connstr="${base_connstr}/neondb"
fi
${PSQL} "${new_connstr}" -c "drop database ludicrous;"
${PSQL} "${new_connstr}" -c "CREATE DATABASE ludicrous;"
if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then
NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous?${query_params}"
else
NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous"
fi
${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV
echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV
- name: Create pgcopydb filter file
run: |
cat << EOF > /tmp/pgcopydb_filter.txt
[include-only-table]
public.events
public.emails
public.email_transmissions
public.payments
public.editions
public.edition_modules
public.sp_content
public.email_broadcasts
public.user_collections
public.devices
public.user_accounts
public.lessons
public.lesson_users
public.payment_methods
public.orders
public.course_emails
public.modules
public.users
public.module_users
public.courses
public.payment_gateway_keys
public.accounts
public.roles
public.payment_gateways
public.management
public.event_names
EOF
- name: Invoke pgcopydb
env:
BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }}
run: |
export LD_LIBRARY_PATH=${PGCOPYDB_LIB_PATH}:${PG_16_LIB_PATH}
export PGCOPYDB_SOURCE_PGURI="${BENCHMARK_INGEST_SOURCE_CONNSTR}"
export PGCOPYDB_TARGET_PGURI="${NEW_PROJECT_CONNSTR}"
export PGOPTIONS="-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
${PG_CONFIG} --bindir
${PGCOPYDB} --version
${PGCOPYDB} clone --skip-vacuum --no-owner --no-acl --skip-db-properties --table-jobs 4 \
--index-jobs 4 --restore-jobs 4 --split-tables-larger-than 10GB --skip-extensions \
--use-copy-binary --filters /tmp/pgcopydb_filter.txt 2>&1 | tee /tmp/pgcopydb_${{ matrix.target_project }}.log
# create dummy pgcopydb log to test parsing
# - name: create dummy log for parser test
# run: |
# cat << EOF > /tmp/pgcopydb_${{ matrix.target_project }}.log
# 2024-11-04 18:00:53.433 500861 INFO main.c:136 Running pgcopydb version 0.17.10.g8361a93 from "/usr/lib/postgresql/17/bin/pgcopydb"
# 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1225 [SOURCE] Copying database from "postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60"
# 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1226 [TARGET] Copying database into "postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60"
# 2024-11-04 18:00:53.442 500861 INFO copydb.c:105 Using work dir "/tmp/pgcopydb"
# 2024-11-04 18:00:53.541 500861 INFO snapshot.c:107 Exported snapshot "00000008-00000033-1" from the source database
# 2024-11-04 18:00:53.556 500865 INFO cli_clone_follow.c:543 STEP 1: fetch source database tables, indexes, and sequences
# 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:716 Splitting source candidate tables larger than 10 GB
# 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:829 Table public.events is 96 GB large which is larger than --split-tables-larger-than 10 GB, and does not have a unique column of type integer: splitting by CTID
# 2024-11-04 18:01:05.538 500865 INFO copydb_schema.c:905 Table public.events is 96 GB large, 10 COPY processes will be used, partitioning on ctid.
# 2024-11-04 18:01:05.564 500865 INFO copydb_schema.c:905 Table public.email_transmissions is 27 GB large, 4 COPY processes will be used, partitioning on id.
# 2024-11-04 18:01:05.584 500865 INFO copydb_schema.c:905 Table public.lessons is 25 GB large, 4 COPY processes will be used, partitioning on id.
# 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:905 Table public.lesson_users is 16 GB large, 3 COPY processes will be used, partitioning on id.
# 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:761 Fetched information for 26 tables (including 4 tables split in 21 partitions total), with an estimated total of 907 million tuples and 175 GB on-disk
# 2024-11-04 18:01:05.687 500865 INFO copydb_schema.c:968 Fetched information for 57 indexes (supporting 25 constraints)
# 2024-11-04 18:01:05.753 500865 INFO sequences.c:78 Fetching information for 24 sequences
# 2024-11-04 18:01:05.903 500865 INFO copydb_schema.c:1122 Fetched information for 4 extensions
# 2024-11-04 18:01:06.178 500865 INFO copydb_schema.c:1538 Found 0 indexes (supporting 0 constraints) in the target database
# 2024-11-04 18:01:06.184 500865 INFO cli_clone_follow.c:584 STEP 2: dump the source database schema (pre/post data)
# 2024-11-04 18:01:06.186 500865 INFO pgcmd.c:468 /usr/lib/postgresql/16/bin/pg_dump -Fc --snapshot 00000008-00000033-1 --section=pre-data --section=post-data --file /tmp/pgcopydb/schema/schema.dump 'postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60'
# 2024-11-04 18:01:06.952 500865 INFO cli_clone_follow.c:592 STEP 3: restore the pre-data section to the target database
# 2024-11-04 18:01:07.004 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section pre-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/pre-filtered.list /tmp/pgcopydb/schema/schema.dump
# 2024-11-04 18:01:07.438 500874 INFO table-data.c:656 STEP 4: starting 4 table-data COPY processes
# 2024-11-04 18:01:07.451 500877 INFO vacuum.c:139 STEP 8: skipping VACUUM jobs per --skip-vacuum
# 2024-11-04 18:01:07.457 500875 INFO indexes.c:182 STEP 6: starting 4 CREATE INDEX processes
# 2024-11-04 18:01:07.457 500875 INFO indexes.c:183 STEP 7: constraints are built by the CREATE INDEX processes
# 2024-11-04 18:01:07.507 500865 INFO blobs.c:74 Skipping large objects: none found.
# 2024-11-04 18:01:07.509 500865 INFO sequences.c:194 STEP 9: reset sequences values
# 2024-11-04 18:01:07.510 500886 INFO sequences.c:290 Set sequences values on the target database
# 2024-11-04 20:49:00.587 500865 INFO cli_clone_follow.c:608 STEP 10: restore the post-data section to the target database
# 2024-11-04 20:49:00.600 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section post-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/post-filtered.list /tmp/pgcopydb/schema/schema.dump
# 2024-11-05 10:50:58.508 500865 INFO cli_clone_follow.c:639 All step are now done, 16h49m elapsed
# 2024-11-05 10:50:58.508 500865 INFO summary.c:3155 Printing summary for 26 tables and 57 indexes
# OID | Schema | Name | Parts | copy duration | transmitted bytes | indexes | create index duration
# ------+--------+----------------------+-------+---------------+-------------------+---------+----------------------
# 24654 | public | events | 10 | 1d11h | 878 GB | 1 | 1h41m
# 24623 | public | email_transmissions | 4 | 4h46m | 99 GB | 3 | 2h04m
# 24665 | public | lessons | 4 | 4h42m | 161 GB | 4 | 1m11s
# 24661 | public | lesson_users | 3 | 2h46m | 49 GB | 3 | 39m35s
# 24631 | public | emails | 1 | 34m07s | 10 GB | 2 | 17s
# 24739 | public | payments | 1 | 5m47s | 1848 MB | 4 | 4m40s
# 24681 | public | module_users | 1 | 4m57s | 1610 MB | 3 | 1m50s
# 24694 | public | orders | 1 | 2m50s | 835 MB | 3 | 1m05s
# 24597 | public | devices | 1 | 1m45s | 498 MB | 2 | 40s
# 24723 | public | payment_methods | 1 | 1m24s | 548 MB | 2 | 31s
# 24765 | public | user_collections | 1 | 2m17s | 1005 MB | 2 | 968ms
# 24774 | public | users | 1 | 52s | 291 MB | 4 | 27s
# 24760 | public | user_accounts | 1 | 16s | 172 MB | 3 | 16s
# 24606 | public | edition_modules | 1 | 8s983 | 46 MB | 3 | 4s749
# 24583 | public | course_emails | 1 | 8s526 | 26 MB | 2 | 996ms
# 24685 | public | modules | 1 | 1s592 | 21 MB | 3 | 1s696
# 24610 | public | editions | 1 | 2s199 | 7483 kB | 2 | 1s032
# 24755 | public | sp_content | 1 | 1s555 | 4177 kB | 0 | 0ms
# 24619 | public | email_broadcasts | 1 | 744ms | 2645 kB | 2 | 677ms
# 24590 | public | courses | 1 | 387ms | 1540 kB | 2 | 367ms
# 24704 | public | payment_gateway_keys | 1 | 1s972 | 164 kB | 2 | 27ms
# 24576 | public | accounts | 1 | 58ms | 24 kB | 1 | 14ms
# 24647 | public | event_names | 1 | 32ms | 397 B | 1 | 8ms
# 24716 | public | payment_gateways | 1 | 1s675 | 117 B | 1 | 11ms
# 24748 | public | roles | 1 | 71ms | 173 B | 1 | 8ms
# 24676 | public | management | 1 | 33ms | 40 B | 1 | 19ms
# Step Connection Duration Transfer Concurrency
# -------------------------------------------------- ---------- ---------- ---------- ------------
# Catalog Queries (table ordering, filtering, etc) source 12s 1
# Dump Schema source 765ms 1
# Prepare Schema target 466ms 1
# COPY, INDEX, CONSTRAINTS, VACUUM (wall clock) both 2h47m 12
# COPY (cumulative) both 7h46m 1225 GB 4
# CREATE INDEX (cumulative) target 4h36m 4
# CONSTRAINTS (cumulative) target 8s493 4
# VACUUM (cumulative) target 0ms 4
# Reset Sequences both 60ms 1
# Large Objects (cumulative) (null) 0ms 0
# Finalize Schema both 14h01m 4
# -------------------------------------------------- ---------- ---------- ---------- ------------
# Total Wall Clock Duration both 16h49m 20
# EOF
- name: show tables sizes and retrieve current backpressure seconds
run: |
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
${PSQL} "${NEW_PROJECT_CONNSTR}" -c "\dt+"
BACKPRESSURE_TIME_AFTER_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
echo "BACKPRESSURE_TIME_AFTER_INGEST=${BACKPRESSURE_TIME_AFTER_INGEST}" >> $GITHUB_ENV
- name: Parse pgcopydb log and report performance metrics
env:
PERF_TEST_RESULT_CONNSTR: ${{ secrets.PERF_TEST_RESULT_CONNSTR }}
run: |
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
# Define the log file path
LOG_FILE="/tmp/pgcopydb_${{ matrix.target_project }}.log"
# Get the current git commit hash
git config --global --add safe.directory /__w/neon/neon
COMMIT_HASH=$(git rev-parse --short HEAD)
# Define the platform and test suite
PLATFORM="pg16-${{ matrix.target_project }}-us-east-2-staging"
SUIT="pgcopydb_ingest_bench"
# Function to convert time (e.g., "2h47m", "4h36m", "118ms", "8s493") to seconds
convert_to_seconds() {
local duration=$1
local total_seconds=0
# Check for hours (h)
if [[ "$duration" =~ ([0-9]+)h ]]; then
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 3600))
fi
# Check for seconds (s)
if [[ "$duration" =~ ([0-9]+)s ]]; then
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0}))
fi
# Check for milliseconds (ms) (if applicable)
if [[ "$duration" =~ ([0-9]+)ms ]]; then
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} / 1000))
duration=${duration/${BASH_REMATCH[0]}/} # need to remove it to avoid double counting with m
fi
# Check for minutes (m) - must be checked after ms because m is contained in ms
if [[ "$duration" =~ ([0-9]+)m ]]; then
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 60))
fi
echo $total_seconds
}
# Calculate the backpressure difference in seconds
BACKPRESSURE_TIME_DIFF=$(awk "BEGIN {print $BACKPRESSURE_TIME_AFTER_INGEST - $BACKPRESSURE_TIME_BEFORE_INGEST}")
# Insert the backpressure time difference into the performance database
if [ -n "$BACKPRESSURE_TIME_DIFF" ]; then
PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \"
INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp)
VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', 'backpressure_time', ${BACKPRESSURE_TIME_DIFF}, 'seconds', 'lower_is_better', now());
\""
echo "Inserting backpressure time difference: ${BACKPRESSURE_TIME_DIFF} seconds"
eval $PSQL_CMD
fi
# Extract and process log lines
while IFS= read -r line; do
METRIC_NAME=""
# Match each desired line and extract the relevant information
if [[ "$line" =~ COPY,\ INDEX,\ CONSTRAINTS,\ VACUUM.* ]]; then
METRIC_NAME="COPY, INDEX, CONSTRAINTS, VACUUM (wall clock)"
elif [[ "$line" =~ COPY\ \(cumulative\).* ]]; then
METRIC_NAME="COPY (cumulative)"
elif [[ "$line" =~ CREATE\ INDEX\ \(cumulative\).* ]]; then
METRIC_NAME="CREATE INDEX (cumulative)"
elif [[ "$line" =~ CONSTRAINTS\ \(cumulative\).* ]]; then
METRIC_NAME="CONSTRAINTS (cumulative)"
elif [[ "$line" =~ Finalize\ Schema.* ]]; then
METRIC_NAME="Finalize Schema"
elif [[ "$line" =~ Total\ Wall\ Clock\ Duration.* ]]; then
METRIC_NAME="Total Wall Clock Duration"
fi
# If a metric was matched, insert it into the performance database
if [ -n "$METRIC_NAME" ]; then
DURATION=$(echo "$line" | grep -oP '\d+h\d+m|\d+s|\d+ms|\d{1,2}h\d{1,2}m|\d+\.\d+s' | head -n 1)
METRIC_VALUE=$(convert_to_seconds "$DURATION")
PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \"
INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp)
VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', '${METRIC_NAME}', ${METRIC_VALUE}, 'seconds', 'lower_is_better', now());
\""
echo "Inserting ${METRIC_NAME} with value ${METRIC_VALUE} seconds"
eval $PSQL_CMD
fi
done < "$LOG_FILE"
- name: Delete Neon Project
if: ${{ always() && matrix.target_project == 'new_empty_project' }}
uses: ./.github/actions/neon-project-delete
with:
project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
- name: Delete Neon Branch for large tenant
if: ${{ always() && matrix.target_project == 'large_existing_project' }}
uses: ./.github/actions/neon-branch-delete
with:
project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }}
branch_id: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}

View File

@@ -201,8 +201,6 @@ jobs:
REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
SHA: ${{ github.event.pull_request.head.sha || github.sha }}
with:
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
retries: 5
script: |
const { REPORT_URL, SHA } = process.env

View File

@@ -1,29 +0,0 @@
name: Report Workflow Stats Batch
on:
schedule:
- cron: '*/15 * * * *'
- cron: '25 0 * * *'
jobs:
gh-workflow-stats-batch:
name: GitHub Workflow Stats Batch
runs-on: ubuntu-22.04
permissions:
actions: read
steps:
- name: Export Workflow Run for the past 2 hours
uses: neondatabase/gh-workflow-stats-action@v0.2.1
with:
db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
db_table: "gh_workflow_stats_batch_neon"
gh_token: ${{ secrets.GITHUB_TOKEN }}
duration: '2h'
- name: Export Workflow Run for the past 24 hours
if: github.event.schedule == '25 0 * * *'
uses: neondatabase/gh-workflow-stats-action@v0.2.1
with:
db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
db_table: "gh_workflow_stats_batch_neon"
gh_token: ${{ secrets.GITHUB_TOKEN }}
duration: '24h'

View File

@@ -624,12 +624,16 @@ FROM build-deps AS pg-cron-pg-build
ARG PG_VERSION
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
# 1.6.4 available, supports v17
# This is an experimental extension that we do not support on prod yet.
# !Do not remove!
# We set it in shared_preload_libraries and computes will fail to start if library is not found.
ENV PATH="/usr/local/pgsql/bin/:$PATH"
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \
echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \
RUN case "${PG_VERSION}" in "v17") \
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
esac && \
wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -1471,8 +1475,6 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml

View File

@@ -26,7 +26,7 @@ commands:
- name: postgres-exporter
user: nobody
sysvInitAction: respawn
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
- name: sql-exporter
user: nobody
sysvInitAction: respawn

View File

@@ -26,7 +26,7 @@ commands:
- name: postgres-exporter
user: nobody
sysvInitAction: respawn
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
- name: sql-exporter
user: nobody
sysvInitAction: respawn

View File

@@ -74,17 +74,10 @@ pub fn write_postgres_conf(
}
// Locales
if cfg!(target_os = "macos") {
writeln!(file, "lc_messages='C'")?;
writeln!(file, "lc_monetary='C'")?;
writeln!(file, "lc_time='C'")?;
writeln!(file, "lc_numeric='C'")?;
} else {
writeln!(file, "lc_messages='C.UTF-8'")?;
writeln!(file, "lc_monetary='C.UTF-8'")?;
writeln!(file, "lc_time='C.UTF-8'")?;
writeln!(file, "lc_numeric='C.UTF-8'")?;
}
writeln!(file, "lc_messages='C.UTF-8'")?;
writeln!(file, "lc_monetary='C.UTF-8'")?;
writeln!(file, "lc_time='C.UTF-8'")?;
writeln!(file, "lc_numeric='C.UTF-8'")?;
match spec.mode {
ComputeMode::Primary => {}

View File

@@ -91,7 +91,7 @@ generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace.
There are two places we need to read the aux files from the pageserver:
* On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that.
* We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API used to always attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error. Furthermore, as aux file reads usually need all layer files intersecting with that key range within the branch and cover a big keyspace, it incurs large overhead for tracking keyspaces that have not been read. Therefore, for sparse keyspaces, we [do not track](https://github.com/neondatabase/neon/pull/9631) `ummapped_keyspace`.
* We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API will attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error.
## Compaction and Image Layer Generation

View File

@@ -277,11 +277,7 @@ pub mod defaults {
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
"C"
} else {
"C.UTF-8"
};
pub const DEFAULT_LOCALE: &str = "C.UTF-8";
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;

View File

@@ -14,7 +14,7 @@ use super::bindings::{
};
use super::wal_generator::LogicalMessageGenerator;
use super::PG_MAJORVERSION;
use crate::pg_constants;
use crate::pg_constants::{self, XLOG_XACT_COMMIT, XLOG_XACT_COMMIT_PREPARED};
use crate::PG_TLI;
use crate::{uint32, uint64, Oid};
use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
@@ -296,6 +296,13 @@ impl XLogRecord {
pub fn is_xlog_switch_record(&self) -> bool {
self.xl_info == pg_constants::XLOG_SWITCH && self.xl_rmid == pg_constants::RM_XLOG_ID
}
// Is this record a transaction commit?
pub fn is_xact_commit(&self) -> bool {
self.xl_rmid == pg_constants::RM_XACT_ID
&& (self.xl_info & pg_constants::XLOG_XACT_OPMASK == XLOG_XACT_COMMIT
|| self.xl_info & pg_constants::XLOG_XACT_OPMASK == XLOG_XACT_COMMIT_PREPARED)
}
}
impl XLogPageHeaderData {

View File

@@ -40,11 +40,6 @@ pub enum Scope {
/// Allows access to storage controller APIs used by the scrubber, to interrogate the state
/// of a tenant & post scrub results.
Scrubber,
/// This scope is used for communication with other storage controller instances.
/// At the time of writing, this is only used for the step down request.
#[serde(rename = "controller_peer")]
ControllerPeer,
}
/// JWT payload. See docs/authentication.md for the format

View File

@@ -35,15 +35,6 @@ pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
!(a.end <= b.start || b.end <= a.start)
}
/// Whether a fully contains b, example as below
/// ```plain
/// | a |
/// | b |
/// ```
pub fn fully_contains<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
a.start <= b.start && a.end >= b.end
}
pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
let x = std::mem::take(a);
let mut all_ranges_iter = [x.into_iter(), b.into_iter()]

View File

@@ -19,8 +19,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
| Scope::SafekeeperData
| Scope::GenerationsApi
| Scope::Infra
| Scope::Scrubber
| Scope::ControllerPeer,
| Scope::Scrubber,
_,
) => Err(AuthError(
format!(

View File

@@ -4786,6 +4786,12 @@ async fn run_initdb(
.args(["--username", &conf.superuser])
.args(["--encoding", "utf8"])
.args(["--locale", &conf.locale])
.args(["--lc-collate", &conf.locale])
.args(["--lc-ctype", &conf.locale])
.args(["--lc-messages", &conf.locale])
.args(["--lc-monetary", &conf.locale])
.args(["--lc-numeric", &conf.locale])
.args(["--lc-time", &conf.locale])
.arg("--no-instructions")
.arg("--no-sync")
.env_clear()
@@ -9223,23 +9229,6 @@ mod tests {
Ok(())
}
fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
(
k1.is_delta,
k1.key_range.start,
k1.key_range.end,
k1.lsn_range.start,
k1.lsn_range.end,
)
.cmp(&(
k2.is_delta,
k2.key_range.start,
k2.key_range.end,
k2.lsn_range.start,
k2.lsn_range.end,
))
}
async fn inspect_and_sort(
tline: &Arc<Timeline>,
filter: Option<std::ops::Range<Key>>,
@@ -9248,30 +9237,25 @@ mod tests {
if let Some(filter) = filter {
all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter));
}
all_layers.sort_by(sort_layer_key);
all_layers.sort_by(|k1, k2| {
(
k1.is_delta,
k1.key_range.start,
k1.key_range.end,
k1.lsn_range.start,
k1.lsn_range.end,
)
.cmp(&(
k2.is_delta,
k2.key_range.start,
k2.key_range.end,
k2.lsn_range.start,
k2.lsn_range.end,
))
});
all_layers
}
#[cfg(feature = "testing")]
fn check_layer_map_key_eq(
mut left: Vec<PersistentLayerKey>,
mut right: Vec<PersistentLayerKey>,
) {
left.sort_by(sort_layer_key);
right.sort_by(sort_layer_key);
if left != right {
eprintln!("---LEFT---");
for left in left.iter() {
eprintln!("{}", left);
}
eprintln!("---RIGHT---");
for right in right.iter() {
eprintln!("{}", right);
}
assert_eq!(left, right);
}
}
#[cfg(feature = "testing")]
#[tokio::test]
async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> {
@@ -9364,206 +9348,127 @@ mod tests {
let cancel = CancellationToken::new();
// Do a partial compaction on key range 0..2
// Do a partial compaction on key range 0..4, we should generate a image layer; no other layers
// can be removed because they might be used for other key ranges.
tline
.partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx)
.partial_compact_with_gc(Some(get_key(0)..get_key(4)), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
check_layer_map_key_eq(
assert_eq!(
all_layers,
vec![
// newly-generated image layer for the partial compaction range 0-2
PersistentLayerKey {
key_range: get_key(0)..get_key(2),
key_range: get_key(0)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
is_delta: false
},
PersistentLayerKey {
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x10)..Lsn(0x11),
is_delta: false,
is_delta: false
},
// delta1 is split and the second part is rewritten
PersistentLayerKey {
key_range: get_key(2)..get_key(4),
key_range: get_key(1)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true,
is_delta: true
},
PersistentLayerKey {
key_range: get_key(5)..get_key(7),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true,
is_delta: true
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true,
},
],
is_delta: true
}
]
);
// Do a partial compaction on key range 2..4
// Do a partial compaction on key range 4..10
tline
.partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx)
.partial_compact_with_gc(Some(get_key(4)..get_key(10)), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
check_layer_map_key_eq(
assert_eq!(
all_layers,
vec![
PersistentLayerKey {
key_range: get_key(0)..get_key(2),
key_range: get_key(0)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
is_delta: false
},
PersistentLayerKey {
// if (in the future) GC kicks in, this layer will be removed
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x10)..Lsn(0x11),
is_delta: false,
is_delta: false
},
// image layer generated for the compaction range 2-4
PersistentLayerKey {
key_range: get_key(2)..get_key(4),
key_range: get_key(4)..get_key(10),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
is_delta: false
},
// we have key2/key3 above the retain_lsn, so we still need this delta layer
PersistentLayerKey {
key_range: get_key(2)..get_key(4),
key_range: get_key(1)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true,
is_delta: true
},
PersistentLayerKey {
key_range: get_key(5)..get_key(7),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true,
is_delta: true
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true,
},
],
);
// Do a partial compaction on key range 4..9
tline
.partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
check_layer_map_key_eq(
all_layers,
vec![
PersistentLayerKey {
key_range: get_key(0)..get_key(2),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x10)..Lsn(0x11),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(2)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(2)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true,
},
// image layer generated for this compaction range
PersistentLayerKey {
key_range: get_key(4)..get_key(9),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true,
},
],
);
// Do a partial compaction on key range 9..10
tline
.partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
check_layer_map_key_eq(
all_layers,
vec![
PersistentLayerKey {
key_range: get_key(0)..get_key(2),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x10)..Lsn(0x11),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(2)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(2)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true,
},
PersistentLayerKey {
key_range: get_key(4)..get_key(9),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
// image layer generated for the compaction range
PersistentLayerKey {
key_range: get_key(9)..get_key(10),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true,
},
],
is_delta: true
}
]
);
// Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones.
tline
.partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx)
.partial_compact_with_gc(Some(get_key(0)..get_key(10)), &cancel, EnumSet::new(), &ctx)
.await
.unwrap();
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
check_layer_map_key_eq(
assert_eq!(
all_layers,
vec![
// aha, we removed all unnecessary image/delta layers and got a very clean layer map!
PersistentLayerKey {
key_range: get_key(0)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
},
PersistentLayerKey {
key_range: get_key(0)..get_key(10),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false,
is_delta: false
},
PersistentLayerKey {
key_range: get_key(2)..get_key(4),
key_range: get_key(4)..get_key(10),
lsn_range: Lsn(0x20)..Lsn(0x21),
is_delta: false
},
PersistentLayerKey {
key_range: get_key(1)..get_key(4),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true,
is_delta: true
},
PersistentLayerKey {
key_range: get_key(5)..get_key(7),
lsn_range: Lsn(0x20)..Lsn(0x48),
is_delta: true
},
PersistentLayerKey {
key_range: get_key(8)..get_key(10),
lsn_range: Lsn(0x48)..Lsn(0x50),
is_delta: true,
},
],
is_delta: true
}
]
);
Ok(())

View File

@@ -653,10 +653,6 @@ impl DeltaLayerWriter {
})
}
pub fn is_empty(&self) -> bool {
self.inner.as_ref().unwrap().num_keys == 0
}
///
/// Append a key-value pair to the file.
///

View File

@@ -1,4 +1,4 @@
use std::{ops::Range, sync::Arc};
use std::ops::Range;
use anyhow::bail;
use pageserver_api::{
@@ -9,10 +9,7 @@ use utils::lsn::Lsn;
use pageserver_api::value::Value;
use super::{
merge_iterator::{MergeIterator, MergeIteratorItem},
PersistentLayerKey,
};
use super::merge_iterator::MergeIterator;
/// A filter iterator over merge iterators (and can be easily extended to other types of iterators).
///
@@ -51,10 +48,10 @@ impl<'a> FilterIterator<'a> {
})
}
async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
while let Some(item) = self.inner.next_inner::<R>().await? {
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
while let Some(item) = self.inner.next().await? {
while self.current_filter_idx < self.retain_key_filters.len()
&& item.key_lsn_value().0 >= self.retain_key_filters[self.current_filter_idx].end
&& item.0 >= self.retain_key_filters[self.current_filter_idx].end
{
// [filter region] [filter region] [filter region]
// ^ item
@@ -71,7 +68,7 @@ impl<'a> FilterIterator<'a> {
// ^ current filter (nothing)
return Ok(None);
}
if self.retain_key_filters[self.current_filter_idx].contains(&item.key_lsn_value().0) {
if self.retain_key_filters[self.current_filter_idx].contains(&item.0) {
// [filter region] [filter region] [filter region]
// ^ item
// ^ current filter
@@ -84,16 +81,6 @@ impl<'a> FilterIterator<'a> {
}
Ok(None)
}
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
self.next_inner().await
}
pub async fn next_with_trace(
&mut self,
) -> anyhow::Result<Option<((Key, Lsn, Value), Arc<PersistentLayerKey>)>> {
self.next_inner().await
}
}
#[cfg(test)]

View File

@@ -1,7 +1,6 @@
use std::{
cmp::Ordering,
collections::{binary_heap, BinaryHeap},
sync::Arc,
};
use anyhow::bail;
@@ -14,11 +13,10 @@ use pageserver_api::value::Value;
use super::{
delta_layer::{DeltaLayerInner, DeltaLayerIterator},
image_layer::{ImageLayerInner, ImageLayerIterator},
PersistentLayerDesc, PersistentLayerKey,
};
#[derive(Clone, Copy)]
pub(crate) enum LayerRef<'a> {
enum LayerRef<'a> {
Image(&'a ImageLayerInner),
Delta(&'a DeltaLayerInner),
}
@@ -64,20 +62,18 @@ impl LayerIterRef<'_> {
/// 1. Unified iterator for image and delta layers.
/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
/// 3. Lazy creation of the real delta/image iterator.
pub(crate) enum IteratorWrapper<'a> {
enum IteratorWrapper<'a> {
NotLoaded {
ctx: &'a RequestContext,
first_key_lower_bound: (Key, Lsn),
layer: LayerRef<'a>,
source_desc: Arc<PersistentLayerKey>,
},
Loaded {
iter: PeekableLayerIterRef<'a>,
source_desc: Arc<PersistentLayerKey>,
},
}
pub(crate) struct PeekableLayerIterRef<'a> {
struct PeekableLayerIterRef<'a> {
iter: LayerIterRef<'a>,
peeked: Option<(Key, Lsn, Value)>, // None == end
}
@@ -155,12 +151,6 @@ impl<'a> IteratorWrapper<'a> {
layer: LayerRef::Image(image_layer),
first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()),
ctx,
source_desc: PersistentLayerKey {
key_range: image_layer.key_range().clone(),
lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer.lsn()),
is_delta: false,
}
.into(),
}
}
@@ -172,18 +162,12 @@ impl<'a> IteratorWrapper<'a> {
layer: LayerRef::Delta(delta_layer),
first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start),
ctx,
source_desc: PersistentLayerKey {
key_range: delta_layer.key_range().clone(),
lsn_range: delta_layer.lsn_range().clone(),
is_delta: true,
}
.into(),
}
}
fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
match self {
Self::Loaded { iter, .. } => iter
Self::Loaded { iter } => iter
.peek()
.as_ref()
.map(|(key, lsn, val)| (key, *lsn, Some(val))),
@@ -207,7 +191,6 @@ impl<'a> IteratorWrapper<'a> {
ctx,
first_key_lower_bound,
layer,
source_desc,
} = self
else {
unreachable!()
@@ -223,10 +206,7 @@ impl<'a> IteratorWrapper<'a> {
);
}
}
*self = Self::Loaded {
iter,
source_desc: source_desc.clone(),
};
*self = Self::Loaded { iter };
Ok(())
}
@@ -240,19 +220,11 @@ impl<'a> IteratorWrapper<'a> {
/// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and
/// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
let Self::Loaded { iter, .. } = self else {
let Self::Loaded { iter } = self else {
panic!("must load the iterator before using")
};
iter.next().await
}
/// Get the persistent layer key corresponding to this iterator
fn trace_source(&self) -> Arc<PersistentLayerKey> {
match self {
Self::Loaded { source_desc, .. } => source_desc.clone(),
Self::NotLoaded { source_desc, .. } => source_desc.clone(),
}
}
}
/// A merge iterator over delta/image layer iterators.
@@ -270,32 +242,6 @@ pub struct MergeIterator<'a> {
heap: BinaryHeap<IteratorWrapper<'a>>,
}
pub(crate) trait MergeIteratorItem {
fn new(item: (Key, Lsn, Value), iterator: &IteratorWrapper<'_>) -> Self;
fn key_lsn_value(&self) -> &(Key, Lsn, Value);
}
impl MergeIteratorItem for (Key, Lsn, Value) {
fn new(item: (Key, Lsn, Value), _: &IteratorWrapper<'_>) -> Self {
item
}
fn key_lsn_value(&self) -> &(Key, Lsn, Value) {
self
}
}
impl MergeIteratorItem for ((Key, Lsn, Value), Arc<PersistentLayerKey>) {
fn new(item: (Key, Lsn, Value), iter: &IteratorWrapper<'_>) -> Self {
(item, iter.trace_source().clone())
}
fn key_lsn_value(&self) -> &(Key, Lsn, Value) {
&self.0
}
}
impl<'a> MergeIterator<'a> {
pub fn create(
deltas: &[&'a DeltaLayerInner],
@@ -314,7 +260,7 @@ impl<'a> MergeIterator<'a> {
}
}
pub(crate) async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
while let Some(mut iter) = self.heap.peek_mut() {
if !iter.is_loaded() {
// Once we load the iterator, we can know the real first key-value pair in the iterator.
@@ -329,22 +275,10 @@ impl<'a> MergeIterator<'a> {
binary_heap::PeekMut::pop(iter);
continue;
};
return Ok(Some(R::new(item, &iter)));
return Ok(Some(item));
}
Ok(None)
}
/// Get the next key-value pair from the iterator.
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
self.next_inner().await
}
/// Get the next key-value pair from the iterator, and trace where the key comes from.
pub async fn next_with_trace(
&mut self,
) -> anyhow::Result<Option<((Key, Lsn, Value), Arc<PersistentLayerKey>)>> {
self.next_inner().await
}
}
#[cfg(test)]

View File

@@ -3531,7 +3531,7 @@ impl Timeline {
self.get_compaction_threshold(),
DEFAULT_COMPACTION_THRESHOLD,
)
&& frozen_layer_total_size >= /* 128 MB */ 128000000
&& frozen_layer_total_size >= /* 64 MB */ 64000000
{
tracing::warn!(
"too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes",

View File

@@ -4,7 +4,7 @@
//!
//! The old legacy algorithm is implemented directly in `timeline.rs`.
use std::collections::{BinaryHeap, HashMap, HashSet};
use std::collections::{BinaryHeap, HashSet};
use std::ops::{Deref, Range};
use std::sync::Arc;
@@ -56,7 +56,7 @@ use pageserver_api::value::Value;
use utils::lsn::Lsn;
use pageserver_compaction::helpers::{fully_contains, overlaps_with};
use pageserver_compaction::helpers::overlaps_with;
use pageserver_compaction::interface::*;
use super::CompactionError;
@@ -64,23 +64,6 @@ use super::CompactionError;
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
const COMPACTION_DELTA_THRESHOLD: usize = 5;
pub struct GcCompactionJobDescription {
/// All layers to read in the compaction job
selected_layers: Vec<Layer>,
/// GC cutoff of the job
gc_cutoff: Lsn,
/// LSNs to retain for the job
retain_lsns_below_horizon: Vec<Lsn>,
/// Maximum layer LSN processed in this compaction
max_layer_lsn: Lsn,
/// Only compact layers overlapping with this range
compaction_key_range: Range<Key>,
/// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap.
/// This field is here solely for debugging. The field will not be read once the compaction
/// description is generated.
rewrite_layers: Vec<Arc<PersistentLayerDesc>>,
}
/// The result of bottom-most compaction for a single key at each LSN.
#[derive(Debug)]
#[cfg_attr(test, derive(PartialEq))]
@@ -1739,8 +1722,7 @@ impl Timeline {
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
self.partial_compact_with_gc(Key::MIN..Key::MAX, cancel, flags, ctx)
.await
self.partial_compact_with_gc(None, cancel, flags, ctx).await
}
/// An experimental compaction building block that combines compaction with garbage collection.
@@ -1750,15 +1732,12 @@ impl Timeline {
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
/// and create delta layers with all deltas >= gc horizon.
///
/// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
/// Partial compaction will read and process all layers overlapping with the key range, even if it might
/// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
/// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing
/// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
/// part of the range.
/// If `key_range`, it will only compact the keys within the range, aka partial compaction. This functionality
/// is not complete yet, and if it is set, only image layers will be generated.
///
pub(crate) async fn partial_compact_with_gc(
self: &Arc<Self>,
compaction_key_range: Range<Key>,
compaction_key_range: Option<Range<Key>>,
cancel: &CancellationToken,
flags: EnumSet<CompactFlags>,
ctx: &RequestContext,
@@ -1783,8 +1762,9 @@ impl Timeline {
.await?;
let dry_run = flags.contains(CompactFlags::DryRun);
let partial_compaction = compaction_key_range.is_some();
if compaction_key_range == (Key::MIN..Key::MAX) {
if let Some(ref compaction_key_range) = compaction_key_range {
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
} else {
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
@@ -1800,7 +1780,7 @@ impl Timeline {
// The layer selection has the following properties:
// 1. If a layer is in the selection, all layers below it are in the selection.
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
let job_desc = {
let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = if !partial_compaction {
let guard = self.layers.read().await;
let layers = guard.layer_map()?;
let gc_info = self.gc_info.read().unwrap();
@@ -1830,21 +1810,9 @@ impl Timeline {
};
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
// layers to compact.
let mut rewrite_layers = Vec::new();
for desc in layers.iter_historic_layers() {
if desc.get_lsn_range().end <= max_layer_lsn
&& overlaps_with(&desc.get_key_range(), &compaction_key_range)
{
// If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range,
// even if it might contain extra keys
if desc.get_lsn_range().end <= max_layer_lsn {
selected_layers.push(guard.get_from_desc(&desc));
// If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine
// to overlap image layers)
if desc.is_delta()
&& !fully_contains(&compaction_key_range, &desc.get_key_range())
{
rewrite_layers.push(desc);
}
}
}
if selected_layers.is_empty() {
@@ -1852,59 +1820,82 @@ impl Timeline {
return Ok(());
}
retain_lsns_below_horizon.sort();
GcCompactionJobDescription {
selected_layers,
gc_cutoff,
retain_lsns_below_horizon,
max_layer_lsn,
compaction_key_range,
rewrite_layers,
(selected_layers, gc_cutoff, retain_lsns_below_horizon)
} else {
// In case of partial compaction, we currently only support generating image layers, and therefore,
// we pick all layers that are below the lowest retain_lsn and does not intersect with any of the layers.
let guard = self.layers.read().await;
let layers = guard.layer_map()?;
let gc_info = self.gc_info.read().unwrap();
let mut min_lsn = gc_info.cutoffs.select_min();
for (lsn, _, _) in &gc_info.retain_lsns {
if lsn < &min_lsn {
min_lsn = *lsn;
}
}
for lsn in gc_info.leases.keys() {
if lsn < &min_lsn {
min_lsn = *lsn;
}
}
let mut selected_layers = Vec::new();
drop(gc_info);
// |-------| |-------| |-------|
// | Delta | | Delta | | Delta | -- min_lsn could be intersecting with the layers
// |-------| |-------| |-------| <- we want to pick all the layers below min_lsn, so that
// | Delta | | Delta | | Delta | ...we can remove them after compaction
// |-------| |-------| |-------|
// Pick all the layers intersect or below the min_lsn, get the largest LSN in the selected layers.
let Some(compaction_key_range) = compaction_key_range.as_ref() else {
unreachable!()
};
for desc in layers.iter_historic_layers() {
if desc.get_lsn_range().end <= min_lsn
&& overlaps_with(&desc.key_range, compaction_key_range)
{
selected_layers.push(guard.get_from_desc(&desc));
}
}
if selected_layers.is_empty() {
info!("no layers to compact with gc");
return Ok(());
}
(selected_layers, min_lsn, Vec::new())
};
let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
if partial_compaction {
warn!("partial compaction cannot run on child branches (for now)");
return Ok(());
}
Lsn(self.ancestor_lsn.0 + 1)
} else {
let res = job_desc
.retain_lsns_below_horizon
let res = retain_lsns_below_horizon
.first()
.copied()
.unwrap_or(job_desc.gc_cutoff);
.unwrap_or(gc_cutoff);
if cfg!(debug_assertions) {
assert_eq!(
res,
job_desc
.retain_lsns_below_horizon
retain_lsns_below_horizon
.iter()
.min()
.copied()
.unwrap_or(job_desc.gc_cutoff)
.unwrap_or(gc_cutoff)
);
}
res
};
info!(
"picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}",
job_desc.selected_layers.len(),
job_desc.rewrite_layers.len(),
job_desc.max_layer_lsn,
job_desc.gc_cutoff,
lowest_retain_lsn,
job_desc.compaction_key_range.start,
job_desc.compaction_key_range.end
"picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
layer_selection.len(),
gc_cutoff,
lowest_retain_lsn
);
for layer in &job_desc.selected_layers {
debug!("read layer: {}", layer.layer_desc().key());
}
for layer in &job_desc.rewrite_layers {
debug!("rewrite layer: {}", layer.key());
}
self.check_compaction_space(&job_desc.selected_layers)
.await?;
self.check_compaction_space(&layer_selection).await?;
// Generate statistics for the compaction
for layer in &job_desc.selected_layers {
for layer in &layer_selection {
let desc = layer.layer_desc();
if desc.is_delta() {
stat.visit_delta_layer(desc.file_size());
@@ -1915,25 +1906,25 @@ impl Timeline {
// Step 1: construct a k-merge iterator over all layers.
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
let layer_names = job_desc
.selected_layers
let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
.iter()
.map(|layer| layer.layer_desc().layer_name())
.collect_vec();
if let Some(err) = check_valid_layermap(&layer_names) {
warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
bail!("cannot run gc-compaction because {}", err);
}
// The maximum LSN we are processing in this compaction loop
let end_lsn = job_desc
.selected_layers
let end_lsn = layer_selection
.iter()
.map(|l| l.layer_desc().lsn_range.end)
.max()
.unwrap();
// We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
// as an L0 layer.
let mut delta_layers = Vec::new();
let mut image_layers = Vec::new();
let mut downloaded_layers = Vec::new();
for layer in &job_desc.selected_layers {
for layer in &layer_selection {
let resident_layer = layer.download_and_keep_resident().await?;
downloaded_layers.push(resident_layer);
}
@@ -1952,8 +1943,8 @@ impl Timeline {
dense_ks,
sparse_ks,
)?;
// Step 2: Produce images+deltas.
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
// Data of the same key.
let mut accumulated_values = Vec::new();
let mut last_key: Option<Key> = None;
@@ -1965,7 +1956,10 @@ impl Timeline {
self.conf,
self.timeline_id,
self.tenant_shard_id,
job_desc.compaction_key_range.start,
compaction_key_range
.as_ref()
.map(|x| x.start)
.unwrap_or(Key::MIN),
lowest_retain_lsn,
self.get_compaction_target_size(),
ctx,
@@ -1985,13 +1979,6 @@ impl Timeline {
)
.await?;
#[derive(Default)]
struct RewritingLayers {
before: Option<DeltaLayerWriter>,
after: Option<DeltaLayerWriter>,
}
let mut delta_layer_rewriters = HashMap::<Arc<PersistentLayerKey>, RewritingLayers>::new();
/// Returns None if there is no ancestor branch. Throw an error when the key is not found.
///
/// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
@@ -2017,51 +2004,10 @@ impl Timeline {
// the key and LSN range are determined. However, to keep things simple here, we still
// create this writer, and discard the writer in the end.
while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
while let Some((key, lsn, val)) = merge_iter.next().await? {
if cancel.is_cancelled() {
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
}
if !job_desc.compaction_key_range.contains(&key) {
if !desc.is_delta {
continue;
}
let rewriter = delta_layer_rewriters.entry(desc.clone()).or_default();
let rewriter = if key < job_desc.compaction_key_range.start {
if rewriter.before.is_none() {
rewriter.before = Some(
DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
desc.key_range.start,
desc.lsn_range.clone(),
ctx,
)
.await?,
);
}
rewriter.before.as_mut().unwrap()
} else if key >= job_desc.compaction_key_range.end {
if rewriter.after.is_none() {
rewriter.after = Some(
DeltaLayerWriter::new(
self.conf,
self.timeline_id,
self.tenant_shard_id,
job_desc.compaction_key_range.end,
desc.lsn_range.clone(),
ctx,
)
.await?,
);
}
rewriter.after.as_mut().unwrap()
} else {
unreachable!()
};
rewriter.put_value(key, lsn, val, ctx).await?;
continue;
}
match val {
Value::Image(_) => stat.visit_image_key(&val),
Value::WalRecord(_) => stat.visit_wal_key(&val),
@@ -2072,27 +2018,35 @@ impl Timeline {
}
accumulated_values.push((key, lsn, val));
} else {
let last_key: &mut Key = last_key.as_mut().unwrap();
stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction
let retention = self
.generate_key_retention(
*last_key,
&accumulated_values,
job_desc.gc_cutoff,
&job_desc.retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, *last_key, ctx).await?,
)
.await?;
retention
.pipe_to(
*last_key,
&mut delta_layer_writer,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
let last_key = last_key.as_mut().unwrap();
stat.on_unique_key_visited();
let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range {
!compaction_key_range.contains(last_key)
} else {
false
};
if !skip_adding_key {
let retention = self
.generate_key_retention(
*last_key,
&accumulated_values,
gc_cutoff,
&retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, *last_key, ctx).await?,
)
.await?;
// Put the image into the image layer. Currently we have a single big layer for the compaction.
retention
.pipe_to(
*last_key,
&mut delta_layer_writer,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
}
accumulated_values.clear();
*last_key = key;
accumulated_values.push((key, lsn, val));
@@ -2103,42 +2057,34 @@ impl Timeline {
let last_key = last_key.expect("no keys produced during compaction");
stat.on_unique_key_visited();
let retention = self
.generate_key_retention(
last_key,
&accumulated_values,
job_desc.gc_cutoff,
&job_desc.retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, last_key, ctx).await?,
)
.await?;
retention
.pipe_to(
last_key,
&mut delta_layer_writer,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
// end: move the above part to the loop body
let mut rewrote_delta_layers = Vec::new();
for (key, writers) in delta_layer_rewriters {
if let Some(delta_writer_before) = writers.before {
let (desc, path) = delta_writer_before
.finish(job_desc.compaction_key_range.start, ctx)
.await?;
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
rewrote_delta_layers.push(layer);
}
if let Some(delta_writer_after) = writers.after {
let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
rewrote_delta_layers.push(layer);
}
let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range {
!compaction_key_range.contains(&last_key)
} else {
false
};
if !skip_adding_key {
let retention = self
.generate_key_retention(
last_key,
&accumulated_values,
gc_cutoff,
&retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
get_ancestor_image(self, last_key, ctx).await?,
)
.await?;
// Put the image into the image layer. Currently we have a single big layer for the compaction.
retention
.pipe_to(
last_key,
&mut delta_layer_writer,
image_layer_writer.as_mut(),
&mut stat,
ctx,
)
.await?;
}
// end: move the above part to the loop body
let discard = |key: &PersistentLayerKey| {
let key = key.clone();
@@ -2147,7 +2093,10 @@ impl Timeline {
let produced_image_layers = if let Some(writer) = image_layer_writer {
if !dry_run {
let end_key = job_desc.compaction_key_range.end;
let end_key = compaction_key_range
.as_ref()
.map(|x| x.end)
.unwrap_or(Key::MAX);
writer
.finish_with_discard_fn(self, ctx, end_key, discard)
.await?
@@ -2168,8 +2117,10 @@ impl Timeline {
Vec::new()
};
// TODO: make image/delta/rewrote_delta layers generation atomic. At this point, we already generated resident layers, and if
// compaction is cancelled at this point, we might have some layers that are not cleaned up.
if partial_compaction && !produced_delta_layers.is_empty() {
bail!("implementation error: partial compaction should not be producing delta layers (for now)");
}
let mut compact_to = Vec::new();
let mut keep_layers = HashSet::new();
let produced_delta_layers_len = produced_delta_layers.len();
@@ -2177,83 +2128,51 @@ impl Timeline {
for action in produced_delta_layers {
match action {
BatchWriterResult::Produced(layer) => {
if cfg!(debug_assertions) {
info!("produced delta layer: {}", layer.layer_desc().key());
}
stat.produce_delta_layer(layer.layer_desc().file_size());
compact_to.push(layer);
}
BatchWriterResult::Discarded(l) => {
if cfg!(debug_assertions) {
info!("discarded delta layer: {}", l);
}
keep_layers.insert(l);
stat.discard_delta_layer();
}
}
}
for layer in &rewrote_delta_layers {
debug!(
"produced rewritten delta layer: {}",
layer.layer_desc().key()
);
}
compact_to.extend(rewrote_delta_layers);
for action in produced_image_layers {
match action {
BatchWriterResult::Produced(layer) => {
debug!("produced image layer: {}", layer.layer_desc().key());
stat.produce_image_layer(layer.layer_desc().file_size());
compact_to.push(layer);
}
BatchWriterResult::Discarded(l) => {
debug!("discarded image layer: {}", l);
keep_layers.insert(l);
stat.discard_image_layer();
}
}
}
let mut layer_selection = job_desc.selected_layers;
// Partial compaction might select more data than it processes, e.g., if
// the compaction_key_range only partially overlaps:
//
// [---compaction_key_range---]
// [---A----][----B----][----C----][----D----]
//
// For delta layers, we will rewrite the layers so that it is cut exactly at
// the compaction key range, so we can always discard them. However, for image
// layers, as we do not rewrite them for now, we need to handle them differently.
// Assume image layers A, B, C, D are all in the `layer_selection`.
//
// The created image layers contain whatever is needed from B, C, and from
// `----]` of A, and from `[---` of D.
//
// In contrast, `[---A` and `D----]` have not been processed, so, we must
// keep that data.
//
// The solution for now is to keep A and D completely if they are image layers.
// (layer_selection is what we'll remove from the layer map, so, retain what
// is _not_ fully covered by compaction_key_range).
for layer in &layer_selection {
if !layer.layer_desc().is_delta() {
if !overlaps_with(
&layer.layer_desc().key_range,
&job_desc.compaction_key_range,
) {
bail!("violated constraint: image layer outside of compaction key range");
}
if !fully_contains(
&job_desc.compaction_key_range,
&layer.layer_desc().key_range,
) {
keep_layers.insert(layer.layer_desc().key());
}
}
}
let mut layer_selection = layer_selection;
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
if let Some(ref compaction_key_range) = compaction_key_range {
// Partial compaction might select more data than it processes, e.g., if
// the compaction_key_range only partially overlaps:
//
// [---compaction_key_range---]
// [---A----][----B----][----C----][----D----]
//
// A,B,C,D are all in the `layer_selection`. The created image layers contain
// whatever is needed from B, C, and from `----]` of A, and from `[--` of D.
//
// In contrast, `[--A-` and `--D----]` have not been processed, so, we must
// keep that data.
//
// The solution for now is to keep A and D completely.
// (layer_selection is what we'll remove from the layer map, so,
// retain what is _not_ fully covered by compaction_key_range).
layer_selection.retain(|x| {
let key_range = &x.layer_desc().key_range;
key_range.start >= compaction_key_range.start
&& key_range.end <= compaction_key_range.end
});
}
info!(
"gc-compaction statistics: {}",
@@ -2273,7 +2192,6 @@ impl Timeline {
// Step 3: Place back to the layer map.
{
// TODO: sanity check if the layer map is valid (i.e., should not have overlaps)
let mut guard = self.layers.write().await;
guard
.open_mut()?

View File

@@ -20,8 +20,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
| Scope::PageServerApi
| Scope::GenerationsApi
| Scope::Infra
| Scope::Scrubber
| Scope::ControllerPeer,
| Scope::Scrubber,
_,
) => Err(AuthError(
format!(

View File

@@ -55,7 +55,7 @@ pub static WRITE_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
pub static FLUSH_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
register_histogram!(
"safekeeper_flush_wal_seconds",
"Seconds spent syncing WAL to a disk (excluding segment initialization)",
"Seconds spent syncing WAL to a disk",
DISK_FSYNC_SECONDS_BUCKETS.to_vec()
)
.expect("Failed to register safekeeper_flush_wal_seconds histogram")

View File

@@ -562,6 +562,9 @@ impl WalAcceptor {
// Don't flush the WAL on every append, only periodically via flush_ticker.
// This batches multiple appends per fsync. If the channel is empty after
// sending the reply, we'll schedule an immediate flush.
//
// Note that a flush can still happen on segment bounds, which will result
// in an AppendResponse.
if let ProposerAcceptorMessage::AppendRequest(append_request) = msg {
msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request);
dirty = true;

View File

@@ -947,6 +947,7 @@ where
// while first connection still gets some packets later. It might be
// better to not log this as error! above.
let write_lsn = self.wal_store.write_lsn();
let flush_lsn = self.wal_store.flush_lsn();
if write_lsn > msg.h.begin_lsn {
bail!(
"append request rewrites WAL written before, write_lsn={}, msg lsn={}",
@@ -1012,7 +1013,9 @@ where
);
// If flush_lsn hasn't updated, AppendResponse is not very useful.
if !require_flush {
// This is the common case for !require_flush, but a flush can still
// happen on segment bounds.
if flush_lsn == self.flush_lsn() {
return Ok(None);
}

View File

@@ -12,7 +12,7 @@ use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use futures::future::BoxFuture;
use postgres_ffi::v14::xlog_utils::{IsPartialXLogFileName, IsXLogFileName, XLogFromFileName};
use postgres_ffi::{dispatch_pgversion, XLogSegNo, PG_TLI};
use postgres_ffi::{dispatch_pgversion, XLogRecord, XLogSegNo, PG_TLI, XLOG_SIZE_OF_XLOG_RECORD};
use remote_storage::RemotePath;
use std::cmp::{max, min};
use std::future::Future;
@@ -113,6 +113,13 @@ pub struct PhysicalStorage {
/// non-aligned chunks of data.
write_record_lsn: Lsn,
/// The last LSN flushed to disk. May be in the middle of a record.
///
/// NB: when the rest of the system refers to `flush_lsn`, it usually
/// actually refers to `flush_record_lsn`. This ambiguity can be dangerous
/// and should be resolved.
flush_lsn: Lsn,
/// The LSN of the last WAL record flushed to disk.
flush_record_lsn: Lsn,
@@ -205,6 +212,7 @@ impl PhysicalStorage {
system_id: state.server.system_id,
write_lsn,
write_record_lsn: write_lsn,
flush_lsn,
flush_record_lsn: flush_lsn,
decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000),
file: None,
@@ -257,9 +265,6 @@ impl PhysicalStorage {
// Try to open existing partial file
Ok((file, true))
} else {
let _timer = WAL_STORAGE_OPERATION_SECONDS
.with_label_values(&["initialize_segment"])
.start_timer();
// Create and fill new partial file
//
// We're using fdatasync during WAL writing, so file size must not
@@ -277,6 +282,8 @@ impl PhysicalStorage {
});
file.set_len(self.wal_seg_size as u64).await?;
// Note: this doesn't get into observe_flush_seconds metric. But
// segment init should be separate metric, if any.
if let Err(e) = durable_rename(&tmp_path, &wal_file_partial_path, !self.no_sync).await {
// Probably rename succeeded, but fsync of it failed. Remove
// the file then to avoid using it.
@@ -289,8 +296,9 @@ impl PhysicalStorage {
}
}
/// Write WAL bytes, which are known to be located in a single WAL segment.
async fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<()> {
/// Write WAL bytes, which are known to be located in a single WAL segment. Returns true if the
/// segment was completed, closed, and flushed to disk.
async fn write_in_segment(&mut self, segno: u64, xlogoff: usize, buf: &[u8]) -> Result<bool> {
let mut file = if let Some(file) = self.file.take() {
file
} else {
@@ -314,20 +322,24 @@ impl PhysicalStorage {
let (wal_file_path, wal_file_partial_path) =
wal_file_paths(&self.timeline_dir, segno, self.wal_seg_size);
fs::rename(wal_file_partial_path, wal_file_path).await?;
Ok(true)
} else {
// otherwise, file can be reused later
self.file = Some(file);
Ok(false)
}
Ok(())
}
/// Writes WAL to the segment files, until everything is writed. If some segments
/// are fully written, they are flushed to disk. The last (partial) segment can
/// be flushed separately later.
///
/// Updates `write_lsn`.
/// Updates `write_lsn` and `flush_lsn`.
async fn write_exact(&mut self, pos: Lsn, mut buf: &[u8]) -> Result<()> {
// TODO: this shouldn't be possible, except possibly with write_lsn == 0.
// Rename this method to `append_exact`, and make it append-only, removing
// the `pos` parameter and this check. For this reason, we don't update
// `flush_lsn` here.
if self.write_lsn != pos {
// need to flush the file before discarding it
if let Some(file) = self.file.take() {
@@ -349,9 +361,13 @@ impl PhysicalStorage {
buf.len()
};
self.write_in_segment(segno, xlogoff, &buf[..bytes_write])
let flushed = self
.write_in_segment(segno, xlogoff, &buf[..bytes_write])
.await?;
self.write_lsn += bytes_write as u64;
if flushed {
self.flush_lsn = self.write_lsn;
}
buf = &buf[bytes_write..];
}
@@ -365,6 +381,9 @@ impl Storage for PhysicalStorage {
self.write_lsn
}
/// flush_lsn returns LSN of last durably stored WAL record.
///
/// TODO: flush_lsn() returns flush_record_lsn, but write_lsn() returns write_lsn: confusing.
#[allow(clippy::misnamed_getters)]
fn flush_lsn(&self) -> Lsn {
self.flush_record_lsn
}
@@ -411,8 +430,9 @@ impl Storage for PhysicalStorage {
self.metrics.observe_write_seconds(write_seconds);
self.metrics.observe_write_bytes(buf.len());
// figure out last record's end lsn for reporting (if we got the
// whole record)
// Figure out the last record's end LSN and update `write_record_lsn`
// (if we got a whole record). The write may also have closed and
// flushed a segment, so update `flush_record_lsn` as well.
if self.decoder.available() != startpos {
info!(
"restart decoder from {} to {}",
@@ -423,13 +443,28 @@ impl Storage for PhysicalStorage {
self.decoder = WalStreamDecoder::new(startpos, pg_version);
}
self.decoder.feed_bytes(buf);
loop {
match self.decoder.poll_decode()? {
None => break, // no full record yet
Some((lsn, _rec)) => {
self.write_record_lsn = lsn;
}
if self.write_record_lsn <= self.flush_lsn {
// We may have flushed a previously written record.
self.flush_record_lsn = self.write_record_lsn;
}
let mut xact_commit = false;
while let Some((lsn, rec)) = self.decoder.poll_decode()? {
self.write_record_lsn = lsn;
if lsn <= self.flush_lsn {
self.flush_record_lsn = lsn;
}
// TODO: the decoder already has the record header, make it return it.
let header = XLogRecord::from_slice(&rec[0..XLOG_SIZE_OF_XLOG_RECORD])
.expect("invalid record header");
xact_commit = xact_commit || header.is_xact_commit();
}
// If a transaction committed, flush the WAL. This will emit an AppendResponse to the
// compute. Otherwise, with pipelined ingestion, the txn may have to wait until the next
// periodic flush in 1 second, causing commit latency.
if xact_commit {
self.flush_wal().await?;
}
Ok(())
@@ -445,19 +480,17 @@ impl Storage for PhysicalStorage {
self.fdatasync_file(&unflushed_file).await?;
self.file = Some(unflushed_file);
} else {
// We have unflushed data (write_lsn != flush_lsn), but no file.
// This should only happen if last file was fully written and flushed,
// but haven't updated flush_lsn yet.
if self.write_lsn.segment_offset(self.wal_seg_size) != 0 {
bail!(
"unexpected unflushed data with no open file, write_lsn={}, flush_lsn={}",
self.write_lsn,
self.flush_record_lsn
);
}
// We have unflushed data (write_lsn != flush_lsn), but no file. This
// shouldn't happen, since the segment is flushed on close.
bail!(
"unexpected unflushed data with no open file, write_lsn={}, flush_lsn={}",
self.write_lsn,
self.flush_record_lsn
);
}
// everything is flushed now, let's update flush_lsn
self.flush_lsn = self.write_lsn;
self.flush_record_lsn = self.write_record_lsn;
Ok(())
}
@@ -516,6 +549,7 @@ impl Storage for PhysicalStorage {
// Update LSNs
self.write_lsn = end_pos;
self.write_record_lsn = end_pos;
self.flush_lsn = end_pos;
self.flush_record_lsn = end_pos;
self.is_truncated_after_restart = true;
Ok(())

View File

@@ -1033,7 +1033,7 @@ async fn handle_update_preferred_azs(req: Request<Body>) -> Result<Response<Body
}
async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::ControllerPeer)?;
check_permissions(&req, Scope::Admin)?;
let req = match maybe_forward(req).await {
ForwardOutcome::Forwarded(res) => {

View File

@@ -286,7 +286,7 @@ class PgProtocol:
return self.safe_psql_many([query], **kwargs)[0]
def safe_psql_many(
self, queries: Iterable[str], log_query: bool = True, **kwargs: Any
self, queries: Iterable[str], log_query=True, **kwargs: Any
) -> list[list[tuple[Any, ...]]]:
"""
Execute queries against the node and return all rows.
@@ -306,7 +306,7 @@ class PgProtocol:
result.append(cur.fetchall())
return result
def safe_psql_scalar(self, query: str, log_query: bool = True) -> Any:
def safe_psql_scalar(self, query, log_query=True) -> Any:
"""
Execute query returning single row with single column.
"""

View File

@@ -93,8 +93,6 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
".*WARN.*path=/v1/utilization .*request was dropped before completing",
# Can happen during shutdown
".*scheduling deletion on drop failed: queue is in state Stopped.*",
# Too many frozen layers error is normal during intensive benchmarks
".*too many frozen layers.*",
)

View File

@@ -1,8 +1,10 @@
from __future__ import annotations
import enum
import os
from typing import TYPE_CHECKING
import pytest
from typing_extensions import override
if TYPE_CHECKING:
@@ -16,15 +18,12 @@ This fixture is used to determine which version of Postgres to use for tests.
# Inherit PgVersion from str rather than int to make it easier to pass as a command-line argument
# TODO: use enum.StrEnum for Python >= 3.11
@enum.unique
class PgVersion(str, enum.Enum):
V14 = "14"
V15 = "15"
V16 = "16"
V17 = "17"
# Default Postgres Version for tests that don't really depend on Postgres itself
DEFAULT = V16
# Instead of making version an optional parameter in methods, we can use this fake entry
# to explicitly rely on the default server version (could be different from pg_version fixture value)
NOT_SET = "<-POSTRGRES VERSION IS NOT SET->"
@@ -60,3 +59,27 @@ class PgVersion(str, enum.Enum):
# Make mypy happy
# See https://github.com/python/mypy/issues/3974
return None
DEFAULT_VERSION: PgVersion = PgVersion.V16
def skip_on_postgres(version: PgVersion, reason: str):
return pytest.mark.skipif(
PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version,
reason=reason,
)
def xfail_on_postgres(version: PgVersion, reason: str):
return pytest.mark.xfail(
PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version,
reason=reason,
)
def run_only_on_default_postgres(reason: str):
return pytest.mark.skipif(
PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION,
reason=reason,
)

View File

@@ -25,7 +25,6 @@ from fixtures.pageserver.common_types import (
parse_delta_layer,
parse_image_layer,
)
from fixtures.pg_version import PgVersion
if TYPE_CHECKING:
from collections.abc import Iterable
@@ -38,7 +37,6 @@ if TYPE_CHECKING:
Fn = TypeVar("Fn", bound=Callable[..., Any])
COMPONENT_BINARIES = {
"storage_controller": ("storage_controller",),
"storage_broker": ("storage_broker",),
@@ -521,7 +519,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str
This is essentially:
lines=$(comm -3 \
<(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \
<(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \
<(mkdir right && cd right && tar xf "$right" && find . -type f -print0 | xargs sha256sum | sort -k2) \
| wc -l)
[ "$lines" = "0" ]
@@ -645,40 +643,3 @@ def allpairs_versions():
)
ids.append(f"combination_{''.join(cur_id)}")
return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids}
def skip_on_postgres(version: PgVersion, reason: str):
return pytest.mark.skipif(
PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version,
reason=reason,
)
def xfail_on_postgres(version: PgVersion, reason: str):
return pytest.mark.xfail(
PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version,
reason=reason,
)
def run_only_on_default_postgres(reason: str):
return pytest.mark.skipif(
PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is not PgVersion.DEFAULT,
reason=reason,
)
def skip_in_debug_build(reason: str):
return pytest.mark.skipif(
os.getenv("BUILD_TYPE", "debug") == "debug",
reason=reason,
)
def skip_on_ci(reason: str):
# `CI` variable is always set to `true` on GitHub
# Ref: https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables
return pytest.mark.skipif(
os.getenv("CI", "false") == "true",
reason=reason,
)

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import json
import os
from pathlib import Path
from typing import TYPE_CHECKING
@@ -13,7 +14,7 @@ from fixtures.neon_fixtures import (
PgBin,
wait_for_last_flush_lsn,
)
from fixtures.utils import get_scale_for_db, humantime_to_ms, skip_on_ci
from fixtures.utils import get_scale_for_db, humantime_to_ms
from performance.pageserver.util import (
setup_pageserver_with_tenants,
@@ -37,8 +38,9 @@ if TYPE_CHECKING:
@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)])
@pytest.mark.parametrize("n_tenants", [500])
@pytest.mark.timeout(10000)
@skip_on_ci(
"This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI"
@pytest.mark.skipif(
os.getenv("CI", "false") == "true",
reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI",
)
def test_pageserver_characterize_throughput_with_n_tenants(
neon_env_builder: NeonEnvBuilder,
@@ -64,8 +66,9 @@ def test_pageserver_characterize_throughput_with_n_tenants(
@pytest.mark.parametrize("n_clients", [1, 64])
@pytest.mark.parametrize("n_tenants", [1])
@pytest.mark.timeout(2400)
@skip_on_ci(
"This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI"
@pytest.mark.skipif(
os.getenv("CI", "false") == "true",
reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI",
)
def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(
neon_env_builder: NeonEnvBuilder,

View File

@@ -8,7 +8,7 @@ from fixtures.common_types import Lsn, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv
from fixtures.pageserver.http import TimelineCreate406
from fixtures.utils import query_scalar, skip_in_debug_build
from fixtures.utils import query_scalar
# Test the GC implementation when running with branching.
@@ -48,8 +48,10 @@ from fixtures.utils import query_scalar, skip_in_debug_build
# Because the delta layer D covering lsn1 is corrupted, creating a branch
# starting from lsn1 should return an error as follows:
# could not find data for key ... at LSN ..., for request at LSN ...
@skip_in_debug_build("times out in debug builds")
def test_branch_and_gc(neon_simple_env: NeonEnv):
def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str):
if build_type == "debug":
pytest.skip("times out in debug builds")
env = neon_simple_env
pageserver_http_client = env.pageserver.http_client()

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import enum
import json
import os
import time
from typing import TYPE_CHECKING
@@ -12,7 +13,7 @@ from fixtures.neon_fixtures import (
generate_uploads_and_deletions,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.utils import skip_in_debug_build, wait_until
from fixtures.utils import wait_until
from fixtures.workload import Workload
if TYPE_CHECKING:
@@ -31,7 +32,7 @@ AGGRESIVE_COMPACTION_TENANT_CONF = {
}
@skip_in_debug_build("only run with release build")
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder):
"""
This is a smoke test that compaction kicks in. The workload repeatedly churns

View File

@@ -12,7 +12,6 @@ from fixtures.neon_fixtures import (
NeonEnvBuilder,
)
from fixtures.pg_version import PgVersion
from fixtures.utils import skip_on_postgres
from pytest_httpserver import HTTPServer
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
@@ -42,14 +41,17 @@ def neon_env_builder_local(
return neon_env_builder
@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building")
@skip_on_postgres(PgVersion.V17, reason="TODO: PG17 extension building")
def test_remote_extensions(
httpserver: HTTPServer,
neon_env_builder_local: NeonEnvBuilder,
httpserver_listen_address,
pg_version,
):
if pg_version == PgVersion.V16:
pytest.skip("TODO: PG16 extension building")
if pg_version == PgVersion.V17:
pytest.skip("TODO: PG17 extension building")
# setup mock http server
# that expects request for anon.tar.zst
# and returns the requested file

View File

@@ -4,22 +4,25 @@ from collections.abc import Iterable
from dataclasses import dataclass
from typing import TYPE_CHECKING
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn
from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo
from fixtures.utils import human_bytes, skip_in_debug_build
from fixtures.utils import human_bytes
if TYPE_CHECKING:
from typing import Union
@skip_in_debug_build("debug run is unnecessarily slow")
def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder):
def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, build_type: str):
"""
Build a non-small GIN index which includes similarly batched up images in WAL stream as does pgvector
to show that we no longer create oversized layers.
"""
if build_type == "debug":
pytest.skip("debug run is unnecessarily slow")
minimum_initdb_size = 20 * 1024**2
checkpoint_distance = 32 * 1024**2
minimum_good_layer_size = checkpoint_distance * 0.9

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import os
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
@@ -9,18 +10,12 @@ from fixtures.neon_fixtures import (
wait_for_last_flush_lsn,
)
from fixtures.pg_version import PgVersion
from fixtures.utils import skip_on_postgres
@skip_on_postgres(
PgVersion.V14,
reason="pg_log_standby_snapshot() function is available since Postgres 16",
)
@skip_on_postgres(
PgVersion.V15,
reason="pg_log_standby_snapshot() function is available since Postgres 16",
)
def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg):
if neon_env_builder.pg_version != PgVersion.V16:
pytest.skip("pg_log_standby_snapshot() function is available only in PG16")
env = neon_env_builder.init_start(
initial_tenant_conf={
"gc_period": "0s",

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import time
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
@@ -11,13 +12,17 @@ from fixtures.neon_fixtures import (
from fixtures.pageserver.common_types import parse_layer_file_name
from fixtures.pageserver.utils import wait_for_upload
from fixtures.remote_storage import RemoteStorageKind
from fixtures.utils import skip_in_debug_build
# Crates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway)
# and then download them back.
@skip_in_debug_build("times out in debug builds")
def test_basic_eviction(neon_env_builder: NeonEnvBuilder):
def test_basic_eviction(
neon_env_builder: NeonEnvBuilder,
build_type: str,
):
if build_type == "debug":
pytest.skip("times out in debug builds")
neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
env = neon_env_builder.init_start(

View File

@@ -5,7 +5,8 @@ import uuid
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.utils import run_only_on_default_postgres, wait_until
from fixtures.pg_version import run_only_on_default_postgres
from fixtures.utils import wait_until
@pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"])

View File

@@ -4,31 +4,24 @@ import time
from functools import partial
from random import choice
from string import ascii_lowercase
from typing import TYPE_CHECKING, cast
from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.common_types import Lsn
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnv,
NeonEnvBuilder,
PgProtocol,
logical_replication_sync,
wait_for_last_flush_lsn,
)
from fixtures.utils import wait_until
if TYPE_CHECKING:
from fixtures.neon_fixtures import (
Endpoint,
NeonEnv,
NeonEnvBuilder,
PgProtocol,
VanillaPostgres,
)
def random_string(n: int):
return "".join([choice(ascii_lowercase) for _ in range(n)])
def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres):
def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
tenant_id = env.initial_tenant
@@ -167,10 +160,10 @@ COMMIT;
# Test that neon.logical_replication_max_snap_files works
def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres):
def slot_removed(ep: Endpoint):
def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg):
def slot_removed(ep):
assert (
ep.safe_psql(
endpoint.safe_psql(
"select count(*) from pg_replication_slots where slot_name = 'stale_slot'"
)[0][0]
== 0
@@ -261,7 +254,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of
# Tests that walsender correctly blocks until WAL is downloaded from safekeepers
def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg: VanillaPostgres):
def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg):
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()
@@ -343,13 +336,13 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of
#
# Most pages start with a contrecord, so we don't do anything special
# to ensure that.
def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres):
def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
env.create_branch("init")
endpoint = env.endpoints.create_start("init")
tenant_id = TenantId(cast("str", endpoint.safe_psql("show neon.tenant_id")[0][0]))
timeline_id = TimelineId(cast("str", endpoint.safe_psql("show neon.timeline_id")[0][0]))
tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0]
timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0]
cur = endpoint.connect().cursor()
cur.execute("create table t(key int, value text)")
@@ -387,7 +380,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres)
# logical replication bug as such, but without logical replication,
# records passed ot the WAL redo process are never large enough to hit
# the bug.
def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres):
def test_large_records(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
env.create_branch("init")
@@ -529,20 +522,15 @@ def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn:
because for some WAL records like vacuum subscriber won't get any data at
all.
"""
publisher_flush_lsn = Lsn(
cast("str", publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
)
publisher_flush_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
def check_caughtup():
res = cast(
"tuple[str, str, str]",
publisher.safe_psql(
"""
res = publisher.safe_psql(
"""
select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication sr, pg_replication_slots s
where s.active_pid = sr.pid and s.slot_type = 'logical';
"""
)[0],
)
)[0]
sent_lsn, flush_lsn, curr_publisher_flush_lsn = Lsn(res[0]), Lsn(res[1]), Lsn(res[2])
log.info(
f"sent_lsn={sent_lsn}, flush_lsn={flush_lsn}, publisher_flush_lsn={curr_publisher_flush_lsn}, waiting flush_lsn to reach {publisher_flush_lsn}"
@@ -557,7 +545,7 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication
# flush_lsn reporting to publisher. Without this, subscriber may ack too far,
# losing data on restart because publisher implicitly advances positition given
# in START_REPLICATION to the confirmed_flush_lsn of the slot.
def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres):
def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
# use vanilla as publisher to allow writes on it when safekeeper is down
vanilla_pg.configure(
@@ -605,7 +593,7 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: Van
# logical_replication_wait_flush_lsn_sync is expected to hang while
# safekeeper is down.
vanilla_pg.safe_psql("checkpoint;")
assert cast("int", sub.safe_psql_scalar("SELECT count(*) FROM t")) == 1000
assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000
# restart subscriber and ensure it can catch up lost tail again
sub.stop(mode="immediate")

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import os
import subprocess
from pathlib import Path
from typing import cast
@@ -14,7 +15,7 @@ from fixtures.neon_fixtures import (
parse_project_git_version_output,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build
from fixtures.pg_version import PgVersion, skip_on_postgres
def helper_compare_timeline_list(
@@ -194,8 +195,10 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
res.check_returncode()
@run_only_on_default_postgres(reason="does not use postgres")
@skip_in_debug_build("unit test for test support, either build works")
@skip_on_postgres(PgVersion.V14, reason="does not use postgres")
@pytest.mark.skipif(
os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works"
)
def test_parse_project_git_version_output_positive():
commit = "b6f77b5816cf1dba12a3bc8747941182ce220846"
@@ -214,8 +217,10 @@ def test_parse_project_git_version_output_positive():
assert parse_project_git_version_output(example) == commit
@run_only_on_default_postgres(reason="does not use postgres")
@skip_in_debug_build("unit test for test support, either build works")
@skip_on_postgres(PgVersion.V14, reason="does not use postgres")
@pytest.mark.skipif(
os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works"
)
def test_parse_project_git_version_output_local_docker():
"""
Makes sure the tests don't accept the default version in Dockerfile one gets without providing
@@ -229,8 +234,10 @@ def test_parse_project_git_version_output_local_docker():
assert input in str(e)
@run_only_on_default_postgres(reason="does not use postgres")
@skip_in_debug_build("unit test for test support, either build works")
@skip_on_postgres(PgVersion.V14, reason="does not use postgres")
@pytest.mark.skipif(
os.environ.get("BUILD_TYPE") == "debug", reason="cli api sanity, either build works"
)
def test_binaries_version_parses(neon_binpath: Path):
"""
Ensures that we can parse the actual outputs of --version from a set of binaries.

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import asyncio
import os
import time
from typing import TYPE_CHECKING
@@ -15,7 +16,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
from fixtures.utils import skip_in_debug_build, wait_until
from fixtures.utils import wait_until
if TYPE_CHECKING:
from typing import Optional
@@ -226,9 +227,12 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder):
assert get_dirty_bytes(env) >= dirty_after_write
# We have to use at least ~100MB of data to hit the lowest limit we can configure, which is
# prohibitively slow in debug mode
@skip_in_debug_build("Avoid running bulkier ingest tests in debug mode")
@pytest.mark.skipif(
# We have to use at least ~100MB of data to hit the lowest limit we can configure, which is
# prohibitively slow in debug mode
os.getenv("BUILD_TYPE") == "debug",
reason="Avoid running bulkier ingest tests in debug mode",
)
def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
"""
Test that checkpoints are done based on total ephemeral layer size, even if no one timeline is

View File

@@ -8,7 +8,7 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.remote_storage import s3_storage
from fixtures.utils import skip_in_debug_build, wait_until
from fixtures.utils import wait_until
# Test restarting page server, while safekeeper and compute node keep
@@ -155,8 +155,12 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder):
# safekeeper and compute node keep running.
@pytest.mark.timeout(540)
@pytest.mark.parametrize("shard_count", [None, 4])
@skip_in_debug_build("times out in debug builds")
def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]):
def test_pageserver_chaos(
neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int]
):
if build_type == "debug":
pytest.skip("times out in debug builds")
# same rationale as with the immediate stop; we might leave orphan layers behind.
neon_env_builder.disable_scrub_on_exit()
neon_env_builder.enable_pageserver_remote_storage(s3_storage())

View File

@@ -17,7 +17,7 @@ from fixtures.pageserver.utils import (
wait_for_upload_queue_empty,
)
from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage
from fixtures.utils import skip_in_debug_build, wait_until
from fixtures.utils import wait_until
from fixtures.workload import Workload
from werkzeug.wrappers.request import Request
from werkzeug.wrappers.response import Response
@@ -765,7 +765,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder):
assert download_rate < expect_download_rate * 2
@skip_in_debug_build("only run with release build")
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
@pytest.mark.parametrize("via_controller", [True, False])
def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool):
"""

View File

@@ -3,6 +3,7 @@
#
from __future__ import annotations
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import TYPE_CHECKING, cast
@@ -18,7 +19,6 @@ from fixtures.neon_fixtures import (
)
from fixtures.pg_version import PgVersion
from fixtures.remote_storage import s3_storage
from fixtures.utils import skip_in_debug_build
if TYPE_CHECKING:
from typing import Optional
@@ -329,7 +329,7 @@ def test_sql_regress(
post_checks(env, test_output_dir, DBNAME, endpoint)
@skip_in_debug_build("only run with release build")
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
def test_tx_abort_with_many_relations(
neon_env_builder: NeonEnvBuilder,
):

View File

@@ -5,8 +5,7 @@ import time
from fixtures.neon_fixtures import NeonEnv, logical_replication_sync
def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonEnv, vanilla_pg):
"""Test read replica of a primary which has a logical replication publication"""
def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg):
env = neon_simple_env
n_records = 100000
@@ -14,6 +13,7 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE
primary = env.endpoints.create_start(
branch_name="main",
endpoint_id="primary",
config_lines=["min_wal_size=32MB", "max_wal_size=64MB"],
)
p_con = primary.connect()
p_cur = p_con.cursor()
@@ -30,6 +30,7 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE
secondary = env.endpoints.new_replica_start(
origin=primary,
endpoint_id="secondary",
config_lines=["min_wal_size=32MB", "max_wal_size=64MB"],
)
s_con = secondary.connect()
@@ -47,51 +48,3 @@ def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonE
# Check that LR slot is not copied to replica
s_cur.execute("select count(*) from pg_replication_slots")
assert s_cur.fetchall()[0][0] == 0
def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg):
"""Test that AUX files are not saved at replica"""
env = neon_simple_env
n_records = 200000
primary = env.endpoints.create_start(
branch_name="main",
endpoint_id="primary",
)
p_con = primary.connect()
p_cur = p_con.cursor()
p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))")
p_cur.execute("create publication pub1 for table t")
# start subscriber
vanilla_pg.start()
vanilla_pg.safe_psql("CREATE TABLE t(pk bigint primary key, payload text)")
connstr = primary.connstr().replace("'", "''")
vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1")
for pk in range(n_records):
p_cur.execute("insert into t (pk) values (%s)", (pk,))
# LR snapshot is stored each 15 seconds
time.sleep(16)
# start replica
secondary = env.endpoints.new_replica_start(
origin=primary,
endpoint_id="secondary",
)
s_con = secondary.connect()
s_cur = s_con.cursor()
logical_replication_sync(vanilla_pg, primary)
assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records
s_cur.execute("select count(*) from t")
assert s_cur.fetchall()[0][0] == n_records
primary.stop()
time.sleep(1)
secondary.stop()
assert not secondary.log_contains("cannot make new WAL entries during recovery")

View File

@@ -30,7 +30,7 @@ import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup
from fixtures.pg_version import PgVersion
from fixtures.utils import query_scalar, skip_on_postgres, wait_until
from fixtures.utils import query_scalar, wait_until
CREATE_SUBXACTS_FUNC = """
create or replace function create_subxacts(n integer) returns void as $$
@@ -137,12 +137,6 @@ def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv):
assert secondary_cur.fetchone() == (1,)
@skip_on_postgres(
PgVersion.V14, reason="pg_log_standby_snapshot() function is available since Postgres 16"
)
@skip_on_postgres(
PgVersion.V15, reason="pg_log_standby_snapshot() function is available since Postgres 16"
)
def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version):
"""
Test that starting a replica works right after the primary has
@@ -155,6 +149,9 @@ def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version):
"""
env = neon_simple_env
if env.pg_version == PgVersion.V14 or env.pg_version == PgVersion.V15:
pytest.skip("pg_log_standby_snapshot() function is available only in PG16")
primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
primary_conn = primary.connect()
primary_cur = primary_conn.cursor()

View File

@@ -20,7 +20,7 @@ from fixtures.neon_fixtures import (
)
from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty
from fixtures.remote_storage import s3_storage
from fixtures.utils import skip_in_debug_build, wait_until
from fixtures.utils import wait_until
from fixtures.workload import Workload
from pytest_httpserver import HTTPServer
from typing_extensions import override
@@ -853,9 +853,12 @@ def test_sharding_split_stripe_size(
wait_until(10, 1, assert_restart_notification)
# The quantity of data isn't huge, but debug can be _very_ slow, and the things we're
# validating in this test don't benefit much from debug assertions.
@skip_in_debug_build("Avoid running bulkier ingest tests in debug mode")
@pytest.mark.skipif(
# The quantity of data isn't huge, but debug can be _very_ slow, and the things we're
# validating in this test don't benefit much from debug assertions.
os.getenv("BUILD_TYPE") == "debug",
reason="Avoid running bulkier ingest tests in debug mode",
)
def test_sharding_ingest_layer_sizes(
neon_env_builder: NeonEnvBuilder,
):

View File

@@ -36,12 +36,11 @@ from fixtures.pageserver.utils import (
remote_storage_delete_key,
timeline_delete_wait_completed,
)
from fixtures.pg_version import PgVersion
from fixtures.pg_version import PgVersion, run_only_on_default_postgres
from fixtures.port_distributor import PortDistributor
from fixtures.remote_storage import RemoteStorageKind, s3_storage
from fixtures.storage_controller_proxy import StorageControllerProxy
from fixtures.utils import (
run_only_on_default_postgres,
run_pg_bench_small,
subprocess_capture,
wait_until,

View File

@@ -1,5 +1,6 @@
from __future__ import annotations
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
@@ -20,7 +21,7 @@ from fixtures.pageserver.utils import (
wait_until_tenant_active,
)
from fixtures.pg_version import PgVersion
from fixtures.utils import skip_in_debug_build, wait_until
from fixtures.utils import wait_until
def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder):
@@ -278,7 +279,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa
size_debug_file.write(size_debug)
@skip_in_debug_build("only run with release build")
@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build")
def test_single_branch_get_tenant_size_grows(
neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion
):

View File

@@ -1,22 +1,15 @@
from __future__ import annotations
import json
from typing import Optional
import pytest
from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
NeonEnvBuilder,
last_flush_lsn_upload,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty, list_prefix
from fixtures.remote_storage import S3Storage, s3_storage
from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty
from fixtures.remote_storage import s3_storage
from fixtures.utils import wait_until
from mypy_boto3_s3.type_defs import (
ObjectTypeDef,
)
@pytest.mark.parametrize("shard_count", [0, 4])
@@ -376,146 +369,3 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel
neon_env_builder.pageserver_remote_storage,
prefix=f"tenants/{str(tenant_id)}/",
)
@pytest.mark.parametrize("offload_child", ["offload", "offload-corrupt", "archive", None])
def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Optional[str]):
"""
Ensure that retain_lsn functionality for timelines works, both for offloaded and non-offloaded ones
"""
if offload_child == "offload-corrupt":
# Our corruption code only works with S3 compatible storage
neon_env_builder.enable_pageserver_remote_storage(s3_storage())
env = neon_env_builder.init_start()
ps_http = env.pageserver.http_client()
# Turn off gc and compaction loops: we want to issue them manually for better reliability
tenant_id, root_timeline_id = env.create_tenant(
conf={
# small checkpointing and compaction targets to ensure we generate many upload operations
"checkpoint_distance": 128 * 1024,
"compaction_threshold": 1,
"compaction_target_size": 128 * 1024,
# set small image creation thresholds so that gc deletes data
"image_creation_threshold": 2,
# disable background compaction and GC. We invoke it manually when we want it to happen.
"gc_period": "0s",
"compaction_period": "0s",
# Disable pitr, we only want the latest lsn
"pitr_interval": "0s",
# Don't rely on endpoint lsn leases
"lsn_lease_length": "0s",
}
)
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
endpoint.safe_psql_many(
[
"CREATE TABLE foo(v int, key serial primary key, t text default 'data_content')",
"SELECT setseed(0.4321)",
"INSERT INTO foo SELECT v FROM (SELECT generate_series(1,2048), (random() * 409600)::int as v) as random_numbers",
]
)
pre_branch_sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200")
log.info(f"Pre branch sum: {pre_branch_sum}")
last_flush_lsn_upload(env, endpoint, tenant_id, root_timeline_id)
# Create a branch and write some additional data to the parent
child_timeline_id = env.create_branch("test_archived_branch", tenant_id)
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
# Do some churn of the data. This is important so that we can overwrite image layers.
for i in range(10):
endpoint.safe_psql_many(
[
f"SELECT setseed(0.23{i})",
"UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 2",
"UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 1",
"UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 0",
]
)
post_branch_sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200")
log.info(f"Post branch sum: {post_branch_sum}")
last_flush_lsn_upload(env, endpoint, tenant_id, root_timeline_id)
if offload_child is not None:
ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.ARCHIVED,
)
leaf_detail = ps_http.timeline_detail(
tenant_id,
child_timeline_id,
)
assert leaf_detail["is_archived"] is True
if "offload" in offload_child:
ps_http.timeline_offload(tenant_id, child_timeline_id)
# Do a restart to get rid of any in-memory objects (we only init gc info once, at attach)
env.pageserver.stop()
if offload_child == "offload-corrupt":
assert isinstance(env.pageserver_remote_storage, S3Storage)
listing = list_prefix(
env.pageserver_remote_storage, f"tenants/{str(tenant_id)}/tenant-manifest"
)
objects: list[ObjectTypeDef] = listing.get("Contents", [])
assert len(objects) > 0
remote_key: str = str(objects[0].get("Key", []))
local_path = str(env.repo_dir / "tenant-manifest.json")
log.info(f"Downloading {remote_key} -> {local_path}")
env.pageserver_remote_storage.client.download_file(
env.pageserver_remote_storage.bucket_name, remote_key, local_path
)
log.info(f"Corrupting {local_path}")
with open(local_path) as manifest_json_file:
manifest_json = json.load(manifest_json_file)
for offloaded_timeline in manifest_json["offloaded_timelines"]:
offloaded_timeline["ancestor_retain_lsn"] = None
with open(local_path, "w") as manifest_json_file:
json.dump(manifest_json, manifest_json_file)
log.info(f"Uploading {local_path} -> {remote_key}")
env.pageserver_remote_storage.client.upload_file(
local_path, env.pageserver_remote_storage.bucket_name, remote_key
)
# The point of our earlier efforts was to provoke these
env.pageserver.allowed_errors.extend(
[
".*initial size calculation failed: PageRead.MissingKey.could not find data for key.*",
".*page_service_conn_main.*could not find data for key.*",
]
)
env.pageserver.start()
# Do an agressive gc and compaction of the parent branch
ps_http.timeline_gc(tenant_id=tenant_id, timeline_id=root_timeline_id, gc_horizon=0)
ps_http.timeline_checkpoint(
tenant_id,
root_timeline_id,
force_l0_compaction=True,
force_repartition=True,
wait_until_uploaded=True,
compact=True,
)
if offload_child is not None:
ps_http.timeline_archival_config(
tenant_id,
child_timeline_id,
state=TimelineArchivalState.UNARCHIVED,
)
# Now, after unarchival, the child timeline should still have its data accessible (or corrupted)
if offload_child == "offload-corrupt":
with pytest.raises(RuntimeError, match=".*failed to get basebackup.*"):
env.endpoints.create_start(
"test_archived_branch", tenant_id=tenant_id, basebackup_request_tries=1
)
else:
with env.endpoints.create_start("test_archived_branch", tenant_id=tenant_id) as endpoint:
sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200")
assert sum == pre_branch_sum

View File

@@ -869,17 +869,8 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
assert count == 10000
@pytest.mark.parametrize(
"mode, sharded",
[
("delete_timeline", False),
("delete_timeline", True),
("delete_tenant", False),
# the shared/exclusive lock for tenant is blocking this:
# timeline detach ancestor takes shared, delete tenant takes exclusive
# ("delete_tenant", True)
],
)
@pytest.mark.parametrize("mode", ["delete_timeline", "delete_tenant"])
@pytest.mark.parametrize("sharded", [False, True])
def test_timeline_detach_ancestor_interrupted_by_deletion(
neon_env_builder: NeonEnvBuilder, mode: str, sharded: bool
):
@@ -894,6 +885,11 @@ def test_timeline_detach_ancestor_interrupted_by_deletion(
- shutdown winning over complete, see test_timeline_is_deleted_before_timeline_detach_ancestor_completes
"""
if sharded and mode == "delete_tenant":
# the shared/exclusive lock for tenant is blocking this:
# timeline detach ancestor takes shared, delete tenant takes exclusive
pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen")
shard_count = 2 if sharded else 1
neon_env_builder.num_pageservers = shard_count

View File

@@ -54,8 +54,6 @@ from fixtures.utils import (
PropagatingThread,
get_dir_size,
query_scalar,
run_only_on_default_postgres,
skip_in_debug_build,
start_in_background,
wait_until,
)
@@ -2106,9 +2104,10 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder):
# The only way to verify this without manipulating time is to sleep for a while.
# In this test we sleep for 60 seconds, so this test takes at least 1 minute to run.
# This is longer than most other tests, we run it only for v16 to save CI resources.
@run_only_on_default_postgres("run only on release build to save CI resources")
@skip_in_debug_build("run only on release build to save CI resources")
def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):
if os.environ.get("PYTEST_CURRENT_TEST", "").find("[debug-pg16]") == -1:
pytest.skip("run only on debug postgres v16 to save CI resources")
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()

View File

@@ -14,7 +14,6 @@ from fixtures.common_types import Lsn, TenantId, TimelineId
from fixtures.log_helper import getLogger
from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper
from fixtures.remote_storage import RemoteStorageKind
from fixtures.utils import skip_in_debug_build
if TYPE_CHECKING:
from typing import Optional
@@ -761,8 +760,10 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat
# The test takes more than default 5 minutes on Postgres 16,
# see https://github.com/neondatabase/neon/issues/5305
@pytest.mark.timeout(600)
@skip_in_debug_build("times out in debug builds")
def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, build_type: str):
if build_type == "debug":
pytest.skip("times out in debug builds")
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.init_start()

View File

@@ -1,18 +1,18 @@
{
"v17": [
"17.0",
"ae4cc30dba24f3910533e5a48e8103c3f2fff300"
"9ad2f3c5c37c08069a01c1e3f6b7cf275437e0cb"
],
"v16": [
"16.4",
"03b43900edc5d8d6eecec460bfc89aec7174bd84"
"e131a9c027b202ce92bd7b9cf2569d48a6f9948e"
],
"v15": [
"15.8",
"fd631a959049dfe2b82f67409c8b8b0d3e0016d1"
"22e580fe9ffcea7e02592110b1c9bf426d83cada"
],
"v14": [
"14.13",
"de0a000dafc2e66ce2e39282d3aa1c704fe0390e"
"2199b83fb72680001ce0f43bf6187a21dfb8f45d"
]
}