Mirror of https://github.com/neondatabase/neon.git, synced 2026-01-18 10:52:55 +00:00

Compare commits (4 commits): erik/segme... vs. remove-pos
| Author | SHA1 | Date |
|---|---|---|
| | dc8ca6aaa1 | |
| | af50fd76b7 | |
| | da16233f64 | |
| | 80466bdca2 | |
1 .github/actionlint.yml vendored
@@ -20,4 +20,3 @@ config-variables:
  - REMOTE_STORAGE_AZURE_REGION
  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
  - DEV_AWS_OIDC_ROLE_ARN
  - BENCHMARK_INGEST_TARGET_PROJECTID
@@ -221,8 +221,6 @@ runs:
          REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
          script: |
            const { REPORT_URL, COMMIT_SHA } = process.env
36 .github/actions/set-docker-config-dir/action.yml vendored Normal file
@@ -0,0 +1,36 @@
name: "Set custom docker config directory"
description: "Create a directory for docker config and set DOCKER_CONFIG"

# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings
runs:
  using: "composite"
  steps:
    - name: Show warning on GitHub-hosted runners
      if: runner.environment == 'github-hosted'
      shell: bash -euo pipefail {0}
      run: |
        # Using the following environment variables to find a path to the workflow file
        # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch
        # ${GITHUB_REPOSITORY} - octocat/hello-world
        # ${GITHUB_REF} - refs/heads/my_branch
        # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables

        filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
        filename=${filename_with_ref%"@$GITHUB_REF"}

        # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message
        title='Unnecessary usage of `.github/actions/set-docker-config-dir`'
        message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners'
        echo "::warning file=${filename},title=${title}::${message}"

    - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7
      env:
        DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }}
      with:
        main: |
          mkdir -p "${DOCKER_CONFIG}"
          echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV
        post: |
          if [ -d "${DOCKER_CONFIG}" ]; then
            rm -r "${DOCKER_CONFIG}"
          fi
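For illustration, here is a minimal, standalone sketch of the filename derivation the warning step performs, using the example values from the action's own comments (hypothetical values, not from a real run):

```bash
#!/usr/bin/env bash
set -euo pipefail

# Example values documented in the action's comments (hypothetical, for illustration only).
GITHUB_WORKFLOW_REF='octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch'
GITHUB_REPOSITORY='octocat/hello-world'
GITHUB_REF='refs/heads/my_branch'

# Strip the leading "<owner>/<repo>/" prefix, then the trailing "@<ref>" suffix.
filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"}
filename=${filename_with_ref%"@$GITHUB_REF"}

echo "${filename}"   # prints: .github/workflows/my-workflow.yml
```

The resulting path is what the `::warning file=...` annotation points at.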
11 .github/pull_request_template.md vendored
@@ -1,3 +1,14 @@
## Problem

## Summary of changes

## Checklist before requesting a review

- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.

## Checklist before merging

- [ ] Do not forget to reformat commit message to not include the above checklist
37 .github/workflows/_check-codestyle-python.yml vendored
@@ -1,37 +0,0 @@
name: Check Codestyle Python

on:
  workflow_call:
    inputs:
      build-tools-image:
        description: 'build-tools image'
        required: true
        type: string

defaults:
  run:
    shell: bash -euxo pipefail {0}

jobs:
  check-codestyle-python:
    runs-on: [ self-hosted, small ]
    container:
      image: ${{ inputs.build-tools-image }}
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
      - uses: actions/checkout@v4

      - uses: actions/cache@v4
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

      - run: ./scripts/pysync

      - run: poetry run ruff check .
      - run: poetry run ruff format --check .
      - run: poetry run mypy .
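The reusable workflow above only wraps four commands; as a rough local equivalent (a sketch that assumes Poetry and the repository's `scripts/pysync` are available), the same checks can be run with:

```bash
#!/usr/bin/env bash
set -euxo pipefail

./scripts/pysync                  # sync the Poetry virtualenv, as the workflow does first
poetry run ruff check .           # lint
poetry run ruff format --check .  # formatting check, no files rewritten
poetry run mypy .                 # static type check
```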
6 .github/workflows/benchmarking.yml vendored
@@ -671,10 +671,6 @@ jobs:
      password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    # Increase timeout to 12h, default timeout is 6h
    # we have regression in clickbench causing it to run 2-3x longer
    timeout-minutes: 720

    steps:
      - uses: actions/checkout@v4

@@ -720,7 +716,7 @@ jobs:
          test_selection: performance/test_perf_olap.py
          run_in_parallel: false
          save_perf_report: ${{ env.SAVE_PERF_REPORT }}
          extra_params: -m remote_cluster --timeout 43200 -k test_clickbench
          extra_params: -m remote_cluster --timeout 21600 -k test_clickbench
          pg_version: ${{ env.DEFAULT_PG_VERSION }}
        env:
          VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"

@@ -64,7 +64,7 @@ jobs:

      - uses: actions/checkout@v4

      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/setup-buildx-action@v3
        with:
          cache-binary: false
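As a quick arithmetic check on the timeout values in these hunks (an illustrative sketch, not part of the diff):

```bash
echo $(( 720 * 60 ))      # 43200 -> timeout-minutes: 720 matches the 12h pytest --timeout
echo $(( 43200 / 3600 ))  # 12    -> hours
echo $(( 21600 / 3600 ))  # 6     -> hours, the other --timeout value shown in the hunk
```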
44 .github/workflows/build_and_test.yml vendored
@@ -90,10 +90,35 @@ jobs:
  check-codestyle-python:
    needs: [ check-permissions, build-build-tools-image ]
    uses: ./.github/workflows/_check-codestyle-python.yml
    with:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
    secrets: inherit
    runs-on: [ self-hosted, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
      credentials:
        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
      options: --init

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Cache poetry deps
        uses: actions/cache@v4
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

      - name: Install Python deps
        run: ./scripts/pysync

      - name: Run `ruff check` to ensure code format
        run: poetry run ruff check .

      - name: Run `ruff format` to ensure code format
        run: poetry run ruff format --check .

      - name: Run mypy to check types
        run: poetry run mypy .

  check-codestyle-jsonnet:
    needs: [ check-permissions, build-build-tools-image ]

@@ -116,7 +141,6 @@ jobs:
  # Check that the vendor/postgres-* submodules point to the
  # corresponding REL_*_STABLE_neon branches.
  check-submodules:
    needs: [ check-permissions ]
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout

@@ -497,8 +521,6 @@ jobs:
          REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }}
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
          script: |
            const { REPORT_URL_NEW, COMMIT_SHA } = process.env

@@ -530,7 +552,7 @@ jobs:
        with:
          submodules: true

      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/setup-buildx-action@v3
        with:
          cache-binary: false

@@ -621,7 +643,7 @@ jobs:
        with:
          submodules: true

      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/setup-buildx-action@v3
        with:
          cache-binary: false

@@ -802,7 +824,7 @@ jobs:
          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
          chmod +x vm-builder

      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}

@@ -838,7 +860,7 @@ jobs:
    steps:
      - uses: actions/checkout@v4

      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: ./.github/actions/set-docker-config-dir
      - uses: docker/login-action@v3
        with:
          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
372 .github/workflows/ingest_benchmark.yml vendored
@@ -1,372 +0,0 @@
|
||||
name: Benchmarking
|
||||
|
||||
on:
|
||||
# uncomment to run on push for debugging your PR
|
||||
# push:
|
||||
# branches: [ your branch ]
|
||||
schedule:
|
||||
# * is a special character in YAML so you have to quote this string
|
||||
# ┌───────────── minute (0 - 59)
|
||||
# │ ┌───────────── hour (0 - 23)
|
||||
# │ │ ┌───────────── day of the month (1 - 31)
|
||||
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
|
||||
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
|
||||
- cron: '0 9 * * *' # run once a day, timezone is utc
|
||||
workflow_dispatch: # adds ability to run this manually
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
concurrency:
|
||||
# Allow only one workflow globally because we need dedicated resources which only exist once
|
||||
group: ingest-bench-workflow
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
ingest:
|
||||
strategy:
|
||||
matrix:
|
||||
target_project: [new_empty_project, large_existing_project]
|
||||
permissions:
|
||||
contents: write
|
||||
statuses: write
|
||||
id-token: write # aws-actions/configure-aws-credentials
|
||||
env:
|
||||
PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config
|
||||
PSQL: /tmp/neon/pg_install/v16/bin/psql
|
||||
PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib
|
||||
PGCOPYDB: /pgcopydb/bin/pgcopydb
|
||||
PGCOPYDB_LIB_PATH: /pgcopydb/lib
|
||||
runs-on: [ self-hosted, us-east-2, x64 ]
|
||||
container:
|
||||
image: neondatabase/build-tools:pinned-bookworm
|
||||
credentials:
|
||||
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
|
||||
options: --init
|
||||
timeout-minutes: 1440
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- name: Configure AWS credentials # necessary to download artefacts
|
||||
uses: aws-actions/configure-aws-credentials@v4
|
||||
with:
|
||||
aws-region: eu-central-1
|
||||
role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
|
||||
role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
|
||||
|
||||
- name: Download Neon artifact
|
||||
uses: ./.github/actions/download
|
||||
with:
|
||||
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
|
||||
path: /tmp/neon/
|
||||
prefix: latest
|
||||
|
||||
- name: Create Neon Project
|
||||
if: ${{ matrix.target_project == 'new_empty_project' }}
|
||||
id: create-neon-project-ingest-target
|
||||
uses: ./.github/actions/neon-project-create
|
||||
with:
|
||||
region_id: aws-us-east-2
|
||||
postgres_version: 16
|
||||
compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Initialize Neon project and retrieve current backpressure seconds
|
||||
if: ${{ matrix.target_project == 'new_empty_project' }}
|
||||
env:
|
||||
NEW_PROJECT_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }}
|
||||
NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
|
||||
run: |
|
||||
echo "Initializing Neon project with project_id: ${NEW_PROJECT_ID}"
|
||||
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
|
||||
${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
|
||||
BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
|
||||
echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV
|
||||
echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV
|
||||
|
||||
- name: Create Neon Branch for large tenant
|
||||
if: ${{ matrix.target_project == 'large_existing_project' }}
|
||||
id: create-neon-branch-ingest-target
|
||||
uses: ./.github/actions/neon-branch-create
|
||||
with:
|
||||
project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Initialize Neon project and retrieve current backpressure seconds
|
||||
if: ${{ matrix.target_project == 'large_existing_project' }}
|
||||
env:
|
||||
NEW_PROJECT_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }}
|
||||
NEW_BRANCH_ID: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }}
|
||||
run: |
|
||||
echo "Initializing Neon branch with branch_id: ${NEW_BRANCH_ID}"
|
||||
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
|
||||
# Extract the part before the database name
|
||||
base_connstr="${NEW_PROJECT_CONNSTR%/*}"
|
||||
# Extract the query parameters (if any) after the database name
|
||||
query_params="${NEW_PROJECT_CONNSTR#*\?}"
|
||||
# Reconstruct the new connection string
|
||||
if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then
|
||||
new_connstr="${base_connstr}/neondb?${query_params}"
|
||||
else
|
||||
new_connstr="${base_connstr}/neondb"
|
||||
fi
|
||||
${PSQL} "${new_connstr}" -c "drop database ludicrous;"
|
||||
${PSQL} "${new_connstr}" -c "CREATE DATABASE ludicrous;"
|
||||
if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then
|
||||
NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous?${query_params}"
|
||||
else
|
||||
NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous"
|
||||
fi
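# Illustrative sketch (not part of the original workflow): how the expansions above
# rewrite a connection string. The endpoint below is hypothetical.
#   NEW_PROJECT_CONNSTR="postgres://user@ep-example.us-east-2.aws.neon.build/neondb?sslmode=require"
#   base_connstr="${NEW_PROJECT_CONNSTR%/*}"   # -> postgres://user@ep-example.us-east-2.aws.neon.build
#   query_params="${NEW_PROJECT_CONNSTR#*\?}"  # -> sslmode=require
#   new_connstr="${base_connstr}/neondb?${query_params}"             # -> .../neondb?sslmode=require
#   NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous?${query_params}"  # -> .../ludicrous?sslmode=require
# If the string has no "?", query_params equals the whole string and the else branches apply.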
|
||||
${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;"
|
||||
BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
|
||||
echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV
|
||||
echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV
|
||||
|
||||
|
||||
- name: Create pgcopydb filter file
|
||||
run: |
|
||||
cat << EOF > /tmp/pgcopydb_filter.txt
|
||||
[include-only-table]
|
||||
public.events
|
||||
public.emails
|
||||
public.email_transmissions
|
||||
public.payments
|
||||
public.editions
|
||||
public.edition_modules
|
||||
public.sp_content
|
||||
public.email_broadcasts
|
||||
public.user_collections
|
||||
public.devices
|
||||
public.user_accounts
|
||||
public.lessons
|
||||
public.lesson_users
|
||||
public.payment_methods
|
||||
public.orders
|
||||
public.course_emails
|
||||
public.modules
|
||||
public.users
|
||||
public.module_users
|
||||
public.courses
|
||||
public.payment_gateway_keys
|
||||
public.accounts
|
||||
public.roles
|
||||
public.payment_gateways
|
||||
public.management
|
||||
public.event_names
|
||||
EOF
|
||||
|
||||
- name: Invoke pgcopydb
|
||||
env:
|
||||
BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }}
|
||||
run: |
|
||||
export LD_LIBRARY_PATH=${PGCOPYDB_LIB_PATH}:${PG_16_LIB_PATH}
|
||||
export PGCOPYDB_SOURCE_PGURI="${BENCHMARK_INGEST_SOURCE_CONNSTR}"
|
||||
export PGCOPYDB_TARGET_PGURI="${NEW_PROJECT_CONNSTR}"
|
||||
export PGOPTIONS="-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7"
|
||||
${PG_CONFIG} --bindir
|
||||
${PGCOPYDB} --version
|
||||
${PGCOPYDB} clone --skip-vacuum --no-owner --no-acl --skip-db-properties --table-jobs 4 \
|
||||
--index-jobs 4 --restore-jobs 4 --split-tables-larger-than 10GB --skip-extensions \
|
||||
--use-copy-binary --filters /tmp/pgcopydb_filter.txt 2>&1 | tee /tmp/pgcopydb_${{ matrix.target_project }}.log
|
||||
|
||||
# create dummy pgcopydb log to test parsing
|
||||
# - name: create dummy log for parser test
|
||||
# run: |
|
||||
# cat << EOF > /tmp/pgcopydb_${{ matrix.target_project }}.log
|
||||
# 2024-11-04 18:00:53.433 500861 INFO main.c:136 Running pgcopydb version 0.17.10.g8361a93 from "/usr/lib/postgresql/17/bin/pgcopydb"
|
||||
# 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1225 [SOURCE] Copying database from "postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60"
|
||||
# 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1226 [TARGET] Copying database into "postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60"
|
||||
# 2024-11-04 18:00:53.442 500861 INFO copydb.c:105 Using work dir "/tmp/pgcopydb"
|
||||
# 2024-11-04 18:00:53.541 500861 INFO snapshot.c:107 Exported snapshot "00000008-00000033-1" from the source database
|
||||
# 2024-11-04 18:00:53.556 500865 INFO cli_clone_follow.c:543 STEP 1: fetch source database tables, indexes, and sequences
|
||||
# 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:716 Splitting source candidate tables larger than 10 GB
|
||||
# 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:829 Table public.events is 96 GB large which is larger than --split-tables-larger-than 10 GB, and does not have a unique column of type integer: splitting by CTID
|
||||
# 2024-11-04 18:01:05.538 500865 INFO copydb_schema.c:905 Table public.events is 96 GB large, 10 COPY processes will be used, partitioning on ctid.
|
||||
# 2024-11-04 18:01:05.564 500865 INFO copydb_schema.c:905 Table public.email_transmissions is 27 GB large, 4 COPY processes will be used, partitioning on id.
|
||||
# 2024-11-04 18:01:05.584 500865 INFO copydb_schema.c:905 Table public.lessons is 25 GB large, 4 COPY processes will be used, partitioning on id.
|
||||
# 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:905 Table public.lesson_users is 16 GB large, 3 COPY processes will be used, partitioning on id.
|
||||
# 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:761 Fetched information for 26 tables (including 4 tables split in 21 partitions total), with an estimated total of 907 million tuples and 175 GB on-disk
|
||||
# 2024-11-04 18:01:05.687 500865 INFO copydb_schema.c:968 Fetched information for 57 indexes (supporting 25 constraints)
|
||||
# 2024-11-04 18:01:05.753 500865 INFO sequences.c:78 Fetching information for 24 sequences
|
||||
# 2024-11-04 18:01:05.903 500865 INFO copydb_schema.c:1122 Fetched information for 4 extensions
|
||||
# 2024-11-04 18:01:06.178 500865 INFO copydb_schema.c:1538 Found 0 indexes (supporting 0 constraints) in the target database
|
||||
# 2024-11-04 18:01:06.184 500865 INFO cli_clone_follow.c:584 STEP 2: dump the source database schema (pre/post data)
|
||||
# 2024-11-04 18:01:06.186 500865 INFO pgcmd.c:468 /usr/lib/postgresql/16/bin/pg_dump -Fc --snapshot 00000008-00000033-1 --section=pre-data --section=post-data --file /tmp/pgcopydb/schema/schema.dump 'postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60'
|
||||
# 2024-11-04 18:01:06.952 500865 INFO cli_clone_follow.c:592 STEP 3: restore the pre-data section to the target database
|
||||
# 2024-11-04 18:01:07.004 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section pre-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/pre-filtered.list /tmp/pgcopydb/schema/schema.dump
|
||||
# 2024-11-04 18:01:07.438 500874 INFO table-data.c:656 STEP 4: starting 4 table-data COPY processes
|
||||
# 2024-11-04 18:01:07.451 500877 INFO vacuum.c:139 STEP 8: skipping VACUUM jobs per --skip-vacuum
|
||||
# 2024-11-04 18:01:07.457 500875 INFO indexes.c:182 STEP 6: starting 4 CREATE INDEX processes
|
||||
# 2024-11-04 18:01:07.457 500875 INFO indexes.c:183 STEP 7: constraints are built by the CREATE INDEX processes
|
||||
# 2024-11-04 18:01:07.507 500865 INFO blobs.c:74 Skipping large objects: none found.
|
||||
# 2024-11-04 18:01:07.509 500865 INFO sequences.c:194 STEP 9: reset sequences values
|
||||
# 2024-11-04 18:01:07.510 500886 INFO sequences.c:290 Set sequences values on the target database
|
||||
# 2024-11-04 20:49:00.587 500865 INFO cli_clone_follow.c:608 STEP 10: restore the post-data section to the target database
|
||||
# 2024-11-04 20:49:00.600 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section post-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/post-filtered.list /tmp/pgcopydb/schema/schema.dump
|
||||
# 2024-11-05 10:50:58.508 500865 INFO cli_clone_follow.c:639 All step are now done, 16h49m elapsed
|
||||
# 2024-11-05 10:50:58.508 500865 INFO summary.c:3155 Printing summary for 26 tables and 57 indexes
|
||||
|
||||
# OID | Schema | Name | Parts | copy duration | transmitted bytes | indexes | create index duration
|
||||
# ------+--------+----------------------+-------+---------------+-------------------+---------+----------------------
|
||||
# 24654 | public | events | 10 | 1d11h | 878 GB | 1 | 1h41m
|
||||
# 24623 | public | email_transmissions | 4 | 4h46m | 99 GB | 3 | 2h04m
|
||||
# 24665 | public | lessons | 4 | 4h42m | 161 GB | 4 | 1m11s
|
||||
# 24661 | public | lesson_users | 3 | 2h46m | 49 GB | 3 | 39m35s
|
||||
# 24631 | public | emails | 1 | 34m07s | 10 GB | 2 | 17s
|
||||
# 24739 | public | payments | 1 | 5m47s | 1848 MB | 4 | 4m40s
|
||||
# 24681 | public | module_users | 1 | 4m57s | 1610 MB | 3 | 1m50s
|
||||
# 24694 | public | orders | 1 | 2m50s | 835 MB | 3 | 1m05s
|
||||
# 24597 | public | devices | 1 | 1m45s | 498 MB | 2 | 40s
|
||||
# 24723 | public | payment_methods | 1 | 1m24s | 548 MB | 2 | 31s
|
||||
# 24765 | public | user_collections | 1 | 2m17s | 1005 MB | 2 | 968ms
|
||||
# 24774 | public | users | 1 | 52s | 291 MB | 4 | 27s
|
||||
# 24760 | public | user_accounts | 1 | 16s | 172 MB | 3 | 16s
|
||||
# 24606 | public | edition_modules | 1 | 8s983 | 46 MB | 3 | 4s749
|
||||
# 24583 | public | course_emails | 1 | 8s526 | 26 MB | 2 | 996ms
|
||||
# 24685 | public | modules | 1 | 1s592 | 21 MB | 3 | 1s696
|
||||
# 24610 | public | editions | 1 | 2s199 | 7483 kB | 2 | 1s032
|
||||
# 24755 | public | sp_content | 1 | 1s555 | 4177 kB | 0 | 0ms
|
||||
# 24619 | public | email_broadcasts | 1 | 744ms | 2645 kB | 2 | 677ms
|
||||
# 24590 | public | courses | 1 | 387ms | 1540 kB | 2 | 367ms
|
||||
# 24704 | public | payment_gateway_keys | 1 | 1s972 | 164 kB | 2 | 27ms
|
||||
# 24576 | public | accounts | 1 | 58ms | 24 kB | 1 | 14ms
|
||||
# 24647 | public | event_names | 1 | 32ms | 397 B | 1 | 8ms
|
||||
# 24716 | public | payment_gateways | 1 | 1s675 | 117 B | 1 | 11ms
|
||||
# 24748 | public | roles | 1 | 71ms | 173 B | 1 | 8ms
|
||||
# 24676 | public | management | 1 | 33ms | 40 B | 1 | 19ms
|
||||
|
||||
|
||||
# Step Connection Duration Transfer Concurrency
|
||||
# -------------------------------------------------- ---------- ---------- ---------- ------------
|
||||
# Catalog Queries (table ordering, filtering, etc) source 12s 1
|
||||
# Dump Schema source 765ms 1
|
||||
# Prepare Schema target 466ms 1
|
||||
# COPY, INDEX, CONSTRAINTS, VACUUM (wall clock) both 2h47m 12
|
||||
# COPY (cumulative) both 7h46m 1225 GB 4
|
||||
# CREATE INDEX (cumulative) target 4h36m 4
|
||||
# CONSTRAINTS (cumulative) target 8s493 4
|
||||
# VACUUM (cumulative) target 0ms 4
|
||||
# Reset Sequences both 60ms 1
|
||||
# Large Objects (cumulative) (null) 0ms 0
|
||||
# Finalize Schema both 14h01m 4
|
||||
# -------------------------------------------------- ---------- ---------- ---------- ------------
|
||||
# Total Wall Clock Duration both 16h49m 20
|
||||
|
||||
|
||||
# EOF
|
||||
|
||||
|
||||
- name: show tables sizes and retrieve current backpressure seconds
|
||||
run: |
|
||||
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
|
||||
${PSQL} "${NEW_PROJECT_CONNSTR}" -c "\dt+"
|
||||
BACKPRESSURE_TIME_AFTER_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;")
|
||||
echo "BACKPRESSURE_TIME_AFTER_INGEST=${BACKPRESSURE_TIME_AFTER_INGEST}" >> $GITHUB_ENV
|
||||
|
||||
- name: Parse pgcopydb log and report performance metrics
|
||||
env:
|
||||
PERF_TEST_RESULT_CONNSTR: ${{ secrets.PERF_TEST_RESULT_CONNSTR }}
|
||||
run: |
|
||||
export LD_LIBRARY_PATH=${PG_16_LIB_PATH}
|
||||
|
||||
# Define the log file path
|
||||
LOG_FILE="/tmp/pgcopydb_${{ matrix.target_project }}.log"
|
||||
|
||||
# Get the current git commit hash
|
||||
git config --global --add safe.directory /__w/neon/neon
|
||||
COMMIT_HASH=$(git rev-parse --short HEAD)
|
||||
|
||||
# Define the platform and test suite
|
||||
PLATFORM="pg16-${{ matrix.target_project }}-us-east-2-staging"
|
||||
SUIT="pgcopydb_ingest_bench"
|
||||
|
||||
# Function to convert time (e.g., "2h47m", "4h36m", "118ms", "8s493") to seconds
|
||||
convert_to_seconds() {
|
||||
local duration=$1
|
||||
local total_seconds=0
|
||||
|
||||
# Check for hours (h)
|
||||
if [[ "$duration" =~ ([0-9]+)h ]]; then
|
||||
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 3600))
|
||||
fi
|
||||
|
||||
# Check for seconds (s)
|
||||
if [[ "$duration" =~ ([0-9]+)s ]]; then
|
||||
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0}))
|
||||
fi
|
||||
|
||||
# Check for milliseconds (ms) (if applicable)
|
||||
if [[ "$duration" =~ ([0-9]+)ms ]]; then
|
||||
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} / 1000))
|
||||
duration=${duration/${BASH_REMATCH[0]}/} # need to remove it to avoid double counting with m
|
||||
fi
|
||||
|
||||
# Check for minutes (m) - must be checked after ms because m is contained in ms
|
||||
if [[ "$duration" =~ ([0-9]+)m ]]; then
|
||||
total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 60))
|
||||
fi
|
||||
|
||||
echo $total_seconds
|
||||
}
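# Illustrative spot-check of convert_to_seconds (a sketch, not part of the original
# workflow; assumes the function defined above is loaded in the current shell):
#   convert_to_seconds "2h47m"   # -> 10020  (2*3600 + 47*60)
#   convert_to_seconds "34m07s"  # -> 2047   (34*60 + 7)
#   convert_to_seconds "8s493"   # -> 8      (trailing sub-second digits are ignored)
#   convert_to_seconds "118ms"   # -> 0      (integer division truncates below one second)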
|
||||
|
||||
# Calculate the backpressure difference in seconds
|
||||
BACKPRESSURE_TIME_DIFF=$(awk "BEGIN {print $BACKPRESSURE_TIME_AFTER_INGEST - $BACKPRESSURE_TIME_BEFORE_INGEST}")
|
||||
|
||||
# Insert the backpressure time difference into the performance database
|
||||
if [ -n "$BACKPRESSURE_TIME_DIFF" ]; then
|
||||
PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \"
|
||||
INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp)
|
||||
VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', 'backpressure_time', ${BACKPRESSURE_TIME_DIFF}, 'seconds', 'lower_is_better', now());
|
||||
\""
|
||||
echo "Inserting backpressure time difference: ${BACKPRESSURE_TIME_DIFF} seconds"
|
||||
eval $PSQL_CMD
|
||||
fi
|
||||
|
||||
# Extract and process log lines
|
||||
while IFS= read -r line; do
|
||||
METRIC_NAME=""
|
||||
# Match each desired line and extract the relevant information
|
||||
if [[ "$line" =~ COPY,\ INDEX,\ CONSTRAINTS,\ VACUUM.* ]]; then
|
||||
METRIC_NAME="COPY, INDEX, CONSTRAINTS, VACUUM (wall clock)"
|
||||
elif [[ "$line" =~ COPY\ \(cumulative\).* ]]; then
|
||||
METRIC_NAME="COPY (cumulative)"
|
||||
elif [[ "$line" =~ CREATE\ INDEX\ \(cumulative\).* ]]; then
|
||||
METRIC_NAME="CREATE INDEX (cumulative)"
|
||||
elif [[ "$line" =~ CONSTRAINTS\ \(cumulative\).* ]]; then
|
||||
METRIC_NAME="CONSTRAINTS (cumulative)"
|
||||
elif [[ "$line" =~ Finalize\ Schema.* ]]; then
|
||||
METRIC_NAME="Finalize Schema"
|
||||
elif [[ "$line" =~ Total\ Wall\ Clock\ Duration.* ]]; then
|
||||
METRIC_NAME="Total Wall Clock Duration"
|
||||
fi
|
||||
|
||||
# If a metric was matched, insert it into the performance database
|
||||
if [ -n "$METRIC_NAME" ]; then
|
||||
DURATION=$(echo "$line" | grep -oP '\d+h\d+m|\d+s|\d+ms|\d{1,2}h\d{1,2}m|\d+\.\d+s' | head -n 1)
|
||||
METRIC_VALUE=$(convert_to_seconds "$DURATION")
|
||||
PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \"
|
||||
INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp)
|
||||
VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', '${METRIC_NAME}', ${METRIC_VALUE}, 'seconds', 'lower_is_better', now());
|
||||
\""
|
||||
echo "Inserting ${METRIC_NAME} with value ${METRIC_VALUE} seconds"
|
||||
eval $PSQL_CMD
|
||||
fi
|
||||
done < "$LOG_FILE"
|
||||
|
||||
- name: Delete Neon Project
|
||||
if: ${{ always() && matrix.target_project == 'new_empty_project' }}
|
||||
uses: ./.github/actions/neon-project-delete
|
||||
with:
|
||||
project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
|
||||
- name: Delete Neon Branch for large tenant
|
||||
if: ${{ always() && matrix.target_project == 'large_existing_project' }}
|
||||
uses: ./.github/actions/neon-branch-delete
|
||||
with:
|
||||
project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }}
|
||||
branch_id: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }}
|
||||
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
|
||||
2 .github/workflows/neon_extra_builds.yml vendored
@@ -201,8 +201,6 @@ jobs:
          REPORT_URL: ${{ steps.upload-stats.outputs.report-url }}
          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
        with:
          # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
          retries: 5
          script: |
            const { REPORT_URL, SHA } = process.env
94 .github/workflows/pre-merge-checks.yml vendored
@@ -1,94 +0,0 @@
|
||||
name: Pre-merge checks
|
||||
|
||||
on:
|
||||
merge_group:
|
||||
branches:
|
||||
- main
|
||||
|
||||
defaults:
|
||||
run:
|
||||
shell: bash -euxo pipefail {0}
|
||||
|
||||
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
get-changed-files:
|
||||
runs-on: ubuntu-22.04
|
||||
outputs:
|
||||
python-changed: ${{ steps.python-src.outputs.any_changed }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4
|
||||
id: python-src
|
||||
with:
|
||||
files: |
|
||||
.github/workflows/pre-merge-checks.yml
|
||||
**/**.py
|
||||
poetry.lock
|
||||
pyproject.toml
|
||||
|
||||
- name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES
|
||||
env:
|
||||
PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }}
|
||||
run: |
|
||||
echo "${PYTHON_CHANGED_FILES}"
|
||||
|
||||
check-build-tools-image:
|
||||
if: needs.get-changed-files.outputs.python-changed == 'true'
|
||||
needs: [ get-changed-files ]
|
||||
uses: ./.github/workflows/check-build-tools-image.yml
|
||||
|
||||
build-build-tools-image:
|
||||
needs: [ check-build-tools-image ]
|
||||
uses: ./.github/workflows/build-build-tools-image.yml
|
||||
with:
|
||||
image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }}
|
||||
secrets: inherit
|
||||
|
||||
check-codestyle-python:
|
||||
if: needs.get-changed-files.outputs.python-changed == 'true'
|
||||
needs: [ get-changed-files, build-build-tools-image ]
|
||||
uses: ./.github/workflows/_check-codestyle-python.yml
|
||||
with:
|
||||
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
|
||||
secrets: inherit
|
||||
|
||||
# To get items from the merge queue merged into main we need to satisfy "Status checks that are required".
|
||||
# Currently we require 2 jobs (checks with exact name):
|
||||
# - conclusion
|
||||
# - neon-cloud-e2e
|
||||
conclusion:
|
||||
if: always()
|
||||
permissions:
|
||||
statuses: write # for `github.repos.createCommitStatus(...)`
|
||||
needs:
|
||||
- get-changed-files
|
||||
- check-codestyle-python
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Create fake `neon-cloud-e2e` check
|
||||
uses: actions/github-script@v7
|
||||
with:
|
||||
# Retry script for 5XX server errors: https://github.com/actions/github-script#retries
|
||||
retries: 5
|
||||
script: |
|
||||
const { repo, owner } = context.repo;
|
||||
const targetUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`;
|
||||
|
||||
await github.rest.repos.createCommitStatus({
|
||||
owner: owner,
|
||||
repo: repo,
|
||||
sha: context.sha,
|
||||
context: `neon-cloud-e2e`,
|
||||
state: `success`,
|
||||
target_url: targetUrl,
|
||||
description: `fake check for merge queue`,
|
||||
});
|
||||
|
||||
- name: Fail the job if any of the dependencies do not succeed or skipped
|
||||
run: exit 1
|
||||
if: |
|
||||
(contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true')
|
||||
|| contains(needs.*.result, 'failure')
|
||||
|| contains(needs.*.result, 'cancelled')
|
||||
@@ -1,29 +0,0 @@
|
||||
name: Report Workflow Stats Batch
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '*/15 * * * *'
|
||||
- cron: '25 0 * * *'
|
||||
|
||||
jobs:
|
||||
gh-workflow-stats-batch:
|
||||
name: GitHub Workflow Stats Batch
|
||||
runs-on: ubuntu-22.04
|
||||
permissions:
|
||||
actions: read
|
||||
steps:
|
||||
- name: Export Workflow Run for the past 2 hours
|
||||
uses: neondatabase/gh-workflow-stats-action@v0.2.1
|
||||
with:
|
||||
db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
|
||||
db_table: "gh_workflow_stats_batch_neon"
|
||||
gh_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
duration: '2h'
|
||||
- name: Export Workflow Run for the past 24 hours
|
||||
if: github.event.schedule == '25 0 * * *'
|
||||
uses: neondatabase/gh-workflow-stats-action@v0.2.1
|
||||
with:
|
||||
db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
|
||||
db_table: "gh_workflow_stats_batch_neon"
|
||||
gh_token: ${{ secrets.GITHUB_TOKEN }}
|
||||
duration: '24h'
|
||||
1 .github/workflows/report-workflow-stats.yml vendored
@@ -23,7 +23,6 @@ on:
      - Test Postgres client libraries
      - Trigger E2E Tests
      - cleanup caches by a branch
      - Pre-merge checks
    types: [completed]

jobs:
178 Cargo.lock generated
@@ -310,6 +310,33 @@ dependencies = [
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-lc-rs"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f95446d919226d587817a7d21379e6eb099b97b45110a7f272a444ca5c54070"
|
||||
dependencies = [
|
||||
"aws-lc-sys",
|
||||
"mirai-annotations",
|
||||
"paste",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-lc-sys"
|
||||
version = "0.21.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62"
|
||||
dependencies = [
|
||||
"bindgen 0.69.5",
|
||||
"cc",
|
||||
"cmake",
|
||||
"dunce",
|
||||
"fs_extra",
|
||||
"libc",
|
||||
"paste",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aws-runtime"
|
||||
version = "1.4.3"
|
||||
@@ -915,6 +942,29 @@ dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bindgen"
|
||||
version = "0.69.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
|
||||
dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"cexpr",
|
||||
"clang-sys",
|
||||
"itertools 0.10.5",
|
||||
"lazy_static",
|
||||
"lazycell",
|
||||
"log",
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"regex",
|
||||
"rustc-hash",
|
||||
"shlex",
|
||||
"syn 2.0.52",
|
||||
"which",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bindgen"
|
||||
version = "0.70.1"
|
||||
@@ -924,7 +974,7 @@ dependencies = [
|
||||
"bitflags 2.4.1",
|
||||
"cexpr",
|
||||
"clang-sys",
|
||||
"itertools 0.12.1",
|
||||
"itertools 0.10.5",
|
||||
"log",
|
||||
"prettyplease",
|
||||
"proc-macro2",
|
||||
@@ -1170,6 +1220,15 @@ version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b"
|
||||
|
||||
[[package]]
|
||||
name = "cmake"
|
||||
version = "0.1.51"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colorchoice"
|
||||
version = "1.0.0"
|
||||
@@ -1270,9 +1329,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "const-oid"
|
||||
version = "0.9.6"
|
||||
version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8"
|
||||
checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f"
|
||||
|
||||
[[package]]
|
||||
name = "const-random"
|
||||
@@ -1756,6 +1815,12 @@ dependencies = [
|
||||
"syn 2.0.52",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dunce"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813"
|
||||
|
||||
[[package]]
|
||||
name = "dyn-clone"
|
||||
version = "1.0.14"
|
||||
@@ -2060,6 +2125,12 @@ dependencies = [
|
||||
"tokio-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fs_extra"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c"
|
||||
|
||||
[[package]]
|
||||
name = "fsevent-sys"
|
||||
version = "4.1.0"
|
||||
@@ -2413,6 +2484,15 @@ dependencies = [
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "home"
|
||||
version = "0.5.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5"
|
||||
dependencies = [
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hostname"
|
||||
version = "0.4.0"
|
||||
@@ -2908,6 +2988,12 @@ dependencies = [
|
||||
"spin",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazycell"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.150"
|
||||
@@ -3138,6 +3224,12 @@ dependencies = [
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mirai-annotations"
|
||||
version = "1.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1"
|
||||
|
||||
[[package]]
|
||||
name = "multimap"
|
||||
version = "0.8.3"
|
||||
@@ -3657,7 +3749,6 @@ dependencies = [
|
||||
"tracing",
|
||||
"url",
|
||||
"utils",
|
||||
"wal_decoder",
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
]
|
||||
@@ -4055,7 +4146,7 @@ dependencies = [
|
||||
"bytes",
|
||||
"once_cell",
|
||||
"pq_proto",
|
||||
"rustls 0.23.16",
|
||||
"rustls 0.23.7",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"serde",
|
||||
"thiserror",
|
||||
@@ -4084,7 +4175,7 @@ name = "postgres_ffi"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bindgen",
|
||||
"bindgen 0.70.1",
|
||||
"bytes",
|
||||
"crc32c",
|
||||
"env_logger",
|
||||
@@ -4095,7 +4186,6 @@ dependencies = [
|
||||
"regex",
|
||||
"serde",
|
||||
"thiserror",
|
||||
"tracing",
|
||||
"utils",
|
||||
]
|
||||
|
||||
@@ -4222,7 +4312,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"heck 0.5.0",
|
||||
"itertools 0.12.1",
|
||||
"itertools 0.10.5",
|
||||
"log",
|
||||
"multimap",
|
||||
"once_cell",
|
||||
@@ -4242,7 +4332,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"itertools 0.12.1",
|
||||
"itertools 0.10.5",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.52",
|
||||
@@ -4330,7 +4420,7 @@ dependencies = [
|
||||
"rsa",
|
||||
"rstest",
|
||||
"rustc-hash",
|
||||
"rustls 0.23.16",
|
||||
"rustls 0.23.7",
|
||||
"rustls-native-certs 0.8.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"scopeguard",
|
||||
@@ -4341,8 +4431,6 @@ dependencies = [
|
||||
"smallvec",
|
||||
"smol_str",
|
||||
"socket2",
|
||||
"strum",
|
||||
"strum_macros",
|
||||
"subtle",
|
||||
"thiserror",
|
||||
"tikv-jemalloc-ctl",
|
||||
@@ -4365,7 +4453,6 @@ dependencies = [
|
||||
"walkdir",
|
||||
"workspace_hack",
|
||||
"x509-parser",
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4743,7 +4830,6 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls 0.22.4",
|
||||
"rustls-native-certs 0.7.0",
|
||||
"rustls-pemfile 2.1.1",
|
||||
"rustls-pki-types",
|
||||
"serde",
|
||||
@@ -5018,22 +5104,23 @@ dependencies = [
|
||||
"log",
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
"rustls-webpki 0.102.8",
|
||||
"rustls-webpki 0.102.2",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls"
|
||||
version = "0.23.16"
|
||||
version = "0.23.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e"
|
||||
checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b"
|
||||
dependencies = [
|
||||
"aws-lc-rs",
|
||||
"log",
|
||||
"once_cell",
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
"rustls-webpki 0.102.8",
|
||||
"rustls-webpki 0.102.2",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
@@ -5113,10 +5200,11 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.102.8"
|
||||
version = "0.102.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9"
|
||||
checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610"
|
||||
dependencies = [
|
||||
"aws-lc-rs",
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
"untrusted",
|
||||
@@ -5147,7 +5235,6 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"crc32c",
|
||||
"criterion",
|
||||
"desim",
|
||||
"fail",
|
||||
"futures",
|
||||
@@ -5155,7 +5242,6 @@ dependencies = [
|
||||
"http 1.1.0",
|
||||
"humantime",
|
||||
"hyper 0.14.30",
|
||||
"itertools 0.10.5",
|
||||
"metrics",
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
@@ -5735,7 +5821,6 @@ dependencies = [
|
||||
"once_cell",
|
||||
"parking_lot 0.12.1",
|
||||
"prost",
|
||||
"rustls 0.23.16",
|
||||
"tokio",
|
||||
"tonic",
|
||||
"tonic-build",
|
||||
@@ -5818,7 +5903,7 @@ dependencies = [
|
||||
"postgres_ffi",
|
||||
"remote_storage",
|
||||
"reqwest 0.12.4",
|
||||
"rustls 0.23.16",
|
||||
"rustls 0.23.7",
|
||||
"rustls-native-certs 0.8.0",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -6187,7 +6272,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "tokio-epoll-uring"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
|
||||
dependencies = [
|
||||
"futures",
|
||||
"nix 0.26.4",
|
||||
@@ -6251,7 +6336,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab"
|
||||
dependencies = [
|
||||
"ring",
|
||||
"rustls 0.23.16",
|
||||
"rustls 0.23.7",
|
||||
"tokio",
|
||||
"tokio-postgres",
|
||||
"tokio-rustls 0.26.0",
|
||||
@@ -6285,7 +6370,7 @@ version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4"
|
||||
dependencies = [
|
||||
"rustls 0.23.16",
|
||||
"rustls 0.23.7",
|
||||
"rustls-pki-types",
|
||||
"tokio",
|
||||
]
|
||||
@@ -6694,7 +6779,7 @@ dependencies = [
|
||||
"base64 0.22.1",
|
||||
"log",
|
||||
"once_cell",
|
||||
"rustls 0.23.16",
|
||||
"rustls 0.23.7",
|
||||
"rustls-pki-types",
|
||||
"url",
|
||||
"webpki-roots 0.26.1",
|
||||
@@ -6703,7 +6788,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "uring-common"
|
||||
version = "0.1.0"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1"
|
||||
source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"io-uring",
|
||||
@@ -6869,20 +6954,6 @@ dependencies = [
|
||||
"utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wal_decoder"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
"pageserver_api",
|
||||
"postgres_ffi",
|
||||
"serde",
|
||||
"tracing",
|
||||
"utils",
|
||||
"workspace_hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "walkdir"
|
||||
version = "2.3.3"
|
||||
@@ -6898,7 +6969,7 @@ name = "walproposer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bindgen",
|
||||
"bindgen 0.70.1",
|
||||
"postgres_ffi",
|
||||
"utils",
|
||||
]
|
||||
@@ -7073,6 +7144,18 @@ dependencies = [
|
||||
"rustls-pki-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "which"
|
||||
version = "4.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7"
|
||||
dependencies = [
|
||||
"either",
|
||||
"home",
|
||||
"once_cell",
|
||||
"rustix",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "whoami"
|
||||
version = "1.5.1"
|
||||
@@ -7332,7 +7415,7 @@ dependencies = [
|
||||
"hyper-util",
|
||||
"indexmap 1.9.3",
|
||||
"indexmap 2.0.1",
|
||||
"itertools 0.12.1",
|
||||
"itertools 0.10.5",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"log",
|
||||
@@ -7353,7 +7436,8 @@ dependencies = [
|
||||
"regex-automata 0.4.3",
|
||||
"regex-syntax 0.8.2",
|
||||
"reqwest 0.12.4",
|
||||
"rustls 0.23.16",
|
||||
"rustls 0.23.7",
|
||||
"rustls-webpki 0.102.2",
|
||||
"scopeguard",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@@ -7378,7 +7462,6 @@ dependencies = [
|
||||
"tracing",
|
||||
"tracing-core",
|
||||
"url",
|
||||
"zerocopy",
|
||||
"zeroize",
|
||||
"zstd",
|
||||
"zstd-safe",
|
||||
@@ -7451,7 +7534,6 @@ version = "0.7.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"zerocopy-derive",
|
||||
]
|
||||
|
||||
|
||||
@@ -33,7 +33,6 @@ members = [
|
||||
"libs/postgres_ffi/wal_craft",
|
||||
"libs/vm_monitor",
|
||||
"libs/walproposer",
|
||||
"libs/wal_decoder",
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
@@ -143,7 +142,7 @@ reqwest-retry = "0.5"
|
||||
routerify = "3"
|
||||
rpds = "0.13"
|
||||
rustc-hash = "1.1.0"
|
||||
rustls = { version = "0.23.16", default-features = false }
|
||||
rustls = "0.23"
|
||||
rustls-pemfile = "2"
|
||||
scopeguard = "1.1"
|
||||
sysinfo = "0.29.2"
|
||||
@@ -174,7 +173,7 @@ tokio = { version = "1.17", features = ["macros"] }
|
||||
tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
|
||||
tokio-io-timeout = "1.2.0"
|
||||
tokio-postgres-rustls = "0.12.0"
|
||||
tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
|
||||
tokio-rustls = "0.26"
|
||||
tokio-stream = "0.1"
|
||||
tokio-tar = "0.3"
|
||||
tokio-util = { version = "0.7.10", features = ["io", "rt"] }
|
||||
@@ -196,7 +195,6 @@ walkdir = "2.3.2"
|
||||
rustls-native-certs = "0.8"
|
||||
x509-parser = "0.16"
|
||||
whoami = "1.5.1"
|
||||
zerocopy = { version = "0.7", features = ["derive"] }
|
||||
|
||||
## TODO replace this with tracing
|
||||
env_logger = "0.10"
|
||||
@@ -240,7 +238,6 @@ tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
|
||||
utils = { version = "0.1", path = "./libs/utils/" }
|
||||
vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
|
||||
walproposer = { version = "0.1", path = "./libs/walproposer/" }
|
||||
wal_decoder = { version = "0.1", path = "./libs/wal_decoder" }
|
||||
|
||||
## Common library dependency
|
||||
workspace_hack = { version = "0.1", path = "./workspace_hack/" }
|
||||
|
||||
@@ -1,66 +1,12 @@
|
||||
ARG DEBIAN_VERSION=bullseye
|
||||
|
||||
FROM debian:bookworm-slim AS pgcopydb_builder
|
||||
ARG DEBIAN_VERSION
|
||||
|
||||
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
|
||||
set -e && \
|
||||
apt update && \
|
||||
apt install -y --no-install-recommends \
|
||||
ca-certificates wget gpg && \
|
||||
wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /usr/share/keyrings/postgresql-keyring.gpg && \
|
||||
echo "deb [signed-by=/usr/share/keyrings/postgresql-keyring.gpg] http://apt.postgresql.org/pub/repos/apt bookworm-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \
|
||||
apt-get update && \
|
||||
apt install -y --no-install-recommends \
|
||||
build-essential \
|
||||
autotools-dev \
|
||||
libedit-dev \
|
||||
libgc-dev \
|
||||
libpam0g-dev \
|
||||
libreadline-dev \
|
||||
libselinux1-dev \
|
||||
libxslt1-dev \
|
||||
libssl-dev \
|
||||
libkrb5-dev \
|
||||
zlib1g-dev \
|
||||
liblz4-dev \
|
||||
libpq5 \
|
||||
libpq-dev \
|
||||
libzstd-dev \
|
||||
postgresql-16 \
|
||||
postgresql-server-dev-16 \
|
||||
postgresql-common \
|
||||
python3-sphinx && \
|
||||
wget -O /tmp/pgcopydb.tar.gz https://github.com/dimitri/pgcopydb/archive/refs/tags/v0.17.tar.gz && \
|
||||
mkdir /tmp/pgcopydb && \
|
||||
tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \
|
||||
cd /tmp/pgcopydb && \
|
||||
make -s clean && \
|
||||
make -s -j12 install && \
|
||||
libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \
|
||||
mkdir -p /pgcopydb/lib && \
|
||||
cp "$libpq_path" /pgcopydb/lib/; \
|
||||
else \
|
||||
# copy command below will fail if we don't have dummy files, so we create them for other debian versions
|
||||
mkdir -p /usr/lib/postgresql/16/bin && touch /usr/lib/postgresql/16/bin/pgcopydb && \
|
||||
mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \
|
||||
fi
|
||||
|
||||
FROM debian:${DEBIAN_VERSION}-slim AS build_tools
|
||||
FROM debian:${DEBIAN_VERSION}-slim
|
||||
ARG DEBIAN_VERSION
|
||||
|
||||
# Add nonroot user
|
||||
RUN useradd -ms /bin/bash nonroot -b /home
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
RUN mkdir -p /pgcopydb/bin && \
|
||||
mkdir -p /pgcopydb/lib && \
|
||||
chmod -R 755 /pgcopydb && \
|
||||
chown -R nonroot:nonroot /pgcopydb
|
||||
|
||||
COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb
|
||||
COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5
|
||||
|
||||
# System deps
|
||||
#
|
||||
# 'gdb' is included so that we get backtraces of core dumps produced in
|
||||
@@ -92,7 +38,7 @@ RUN set -e \
|
||||
libseccomp-dev \
|
||||
libsqlite3-dev \
|
||||
libssl-dev \
|
||||
$([[ "${DEBIAN_VERSION}" = "bullseye" ]] && echo libstdc++-10-dev || echo libstdc++-11-dev) \
|
||||
$([[ "${DEBIAN_VERSION}" = "bullseye" ]] && libstdc++-10-dev || libstdc++-11-dev) \
|
||||
libtool \
|
||||
libxml2-dev \
|
||||
libxmlsec1-dev \
|
||||
@@ -111,18 +57,6 @@ RUN set -e \
|
||||
zstd \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
# sql_exporter
|
||||
|
||||
# Keep the version the same as in compute/compute-node.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
ENV SQL_EXPORTER_VERSION=0.13.1
|
||||
RUN curl -fsSL \
|
||||
"https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \
|
||||
--output sql_exporter.tar.gz \
|
||||
&& mkdir /tmp/sql_exporter \
|
||||
&& tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \
|
||||
&& mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter
|
||||
|
||||
# protobuf-compiler (protoc)
|
||||
ENV PROTOC_VERSION=25.1
|
||||
RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \
|
||||
@@ -289,13 +223,7 @@ RUN whoami \
|
||||
&& cargo --version --verbose \
|
||||
&& rustup --version --verbose \
|
||||
&& rustc --version --verbose \
|
||||
&& clang --version
|
||||
|
||||
RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
|
||||
LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \
|
||||
else \
|
||||
echo "pgcopydb is not available for ${DEBIAN_VERSION}"; \
|
||||
fi
|
||||
&& clang --version
|
||||
|
||||
# Set following flag to check in Makefile if its running in Docker
|
||||
RUN touch /home/nonroot/.docker_build
|
||||
|
||||
@@ -22,7 +22,6 @@ sql_exporter.yml: $(jsonnet_files)
|
||||
--output-file etc/$@ \
|
||||
--tla-str collector_name=neon_collector \
|
||||
--tla-str collector_file=neon_collector.yml \
|
||||
--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' \
|
||||
etc/sql_exporter.jsonnet
|
||||
|
||||
sql_exporter_autoscaling.yml: $(jsonnet_files)
|
||||
@@ -30,7 +29,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files)
|
||||
--output-file etc/$@ \
|
||||
--tla-str collector_name=neon_collector_autoscaling \
|
||||
--tla-str collector_file=neon_collector_autoscaling.yml \
|
||||
--tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' \
|
||||
--tla-str application_name=sql_exporter_autoscaling \
|
||||
etc/sql_exporter.jsonnet
|
||||
|
||||
.PHONY: clean
|
||||
|
||||
@@ -431,11 +431,14 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
COPY compute/patches/rum.patch /rum.patch
|
||||
|
||||
# supports v17 since https://github.com/postgrespro/rum/commit/cb1edffc57736cd2a4455f8d0feab0d69928da25
|
||||
# doesn't use releases since 1.3.13 - Sep 19, 2022
|
||||
# use latest commit from the master branch
|
||||
RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0feab0d69928da25.tar.gz -O rum.tar.gz && \
|
||||
echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \
|
||||
# maybe version-specific
|
||||
# support for v17 is unknown
|
||||
# last release 1.3.13 - Sep 19, 2022
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
|
||||
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
|
||||
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
|
||||
patch -p1 < /rum.patch && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
|
||||
@@ -559,8 +562,8 @@ RUN case "${PG_VERSION}" in \
|
||||
export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
|
||||
;; \
|
||||
"v17") \
|
||||
export TIMESCALEDB_VERSION=2.17.1 \
|
||||
export TIMESCALEDB_CHECKSUM=6277cf43f5695e23dae1c5cfeba00474d730b66ed53665a84b787a6bb1a57e28 \
|
||||
export TIMESCALEDB_VERSION=2.17.0 \
|
||||
export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \
|
||||
;; \
|
||||
esac && \
|
||||
wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
|
||||
@@ -624,12 +627,16 @@ FROM build-deps AS pg-cron-pg-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# 1.6.4 available, supports v17
|
||||
# This is an experimental extension that we do not support on prod yet.
|
||||
# !Do not remove!
|
||||
# We set it in shared_preload_libraries and computes will fail to start if library is not found.
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \
|
||||
echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "v17 extensions are not supported yet. Quit" && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \
|
||||
echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) && \
|
||||
make -j $(getconf _NPROCESSORS_ONLN) install && \
|
||||
@@ -866,85 +873,6 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "rust extensions pgrx12"
|
||||
#
|
||||
# pgrx started to support Postgres 17 since version 12,
|
||||
# but some older extension aren't compatible with it.
|
||||
# This layer should be used as a base for new pgrx extensions,
|
||||
# and eventually get merged with `rust-extensions-build`
|
||||
#
|
||||
#########################################################################################
|
||||
FROM build-deps AS rust-extensions-build-pgrx12
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install --no-install-recommends -y curl libclang-dev && \
|
||||
useradd -ms /bin/bash nonroot -b /home
|
||||
|
||||
ENV HOME=/home/nonroot
|
||||
ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH"
|
||||
USER nonroot
|
||||
WORKDIR /home/nonroot
|
||||
|
||||
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
|
||||
chmod +x rustup-init && \
|
||||
./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
|
||||
rm rustup-init && \
|
||||
cargo install --locked --version 0.12.6 cargo-pgrx && \
|
||||
/bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
|
||||
|
||||
USER root
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layers "pg-onnx-build" and "pgrag-pg-build"
|
||||
# Compile "pgrag" extensions
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build-pgrx12 AS pg-onnx-build
|
||||
|
||||
# cmake 3.26 or higher is required, so we install it using pip (bullseye-backports has cmake 3.25).
# Install it in a virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise
|
||||
RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \
|
||||
python3 -m venv venv && \
|
||||
. venv/bin/activate && \
|
||||
python3 -m pip install cmake==3.30.5 && \
|
||||
wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \
|
||||
mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \
|
||||
./build.sh --config Release --parallel --skip_submodule_sync --skip_tests --allow_running_as_root
|
||||
|
||||
|
||||
FROM pg-onnx-build AS pgrag-pg-build
|
||||
|
||||
RUN apt-get install -y protobuf-compiler && \
|
||||
wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \
|
||||
echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \
|
||||
mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \
|
||||
\
|
||||
cd exts/rag && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \
|
||||
\
|
||||
cd ../rag_bge_small_en_v15 && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \
|
||||
\
|
||||
cd ../rag_jina_reranker_v1_tiny_en && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \
|
||||
REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \
|
||||
cargo pgrx install --release --features remote_onnx && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control
|
||||
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
# Layer "pg-jsonschema-pg-build"
|
||||
@@ -952,31 +880,21 @@ RUN apt-get install -y protobuf-compiler && \
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build
|
||||
FROM rust-extensions-build AS pg-jsonschema-pg-build
|
||||
ARG PG_VERSION
|
||||
# version 0.3.3 supports v17
|
||||
# last release v0.3.3 - Oct 16, 2024
|
||||
#
|
||||
# there were no breaking changes
|
||||
# so we can use the same version for all postgres versions
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15" | "v16" | "v17") \
|
||||
export PG_JSONSCHEMA_VERSION=0.3.3 \
|
||||
export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \
|
||||
;; \
|
||||
*) \
|
||||
echo "unexpected PostgreSQL version" && exit 1 \
|
||||
;; \
|
||||
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "pg_jsonschema does not yet have a release that supports pg17" && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \
|
||||
echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \
|
||||
wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \
|
||||
echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
|
||||
# see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8
|
||||
# `unsafe-postgres` feature allows to build pgx extensions
|
||||
# against postgres forks that decided to change their ABI name (like us).
|
||||
# With that we can build extensions without forking them and using stock
|
||||
# pgx. As this feature is new few manual version bumps were required.
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control
|
||||
|
||||
@@ -987,27 +905,16 @@ RUN case "${PG_VERSION}" in \
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build
|
||||
FROM rust-extensions-build AS pg-graphql-pg-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# version 1.5.9 supports v17
|
||||
# last release v1.5.9 - Oct 16, 2024
|
||||
#
|
||||
# there were no breaking changes
|
||||
# so we can use the same version for all postgres versions
|
||||
RUN case "${PG_VERSION}" in \
|
||||
"v14" | "v15" | "v16" | "v17") \
|
||||
export PG_GRAPHQL_VERSION=1.5.9 \
|
||||
export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \
|
||||
;; \
|
||||
*) \
|
||||
echo "unexpected PostgreSQL version" && exit 1 \
|
||||
;; \
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "pg_graphql does not yet have a release that supports pg17 as of now" && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \
|
||||
echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \
|
||||
wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \
|
||||
echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release && \
|
||||
# it's needed to enable extension because it uses untrusted C language
|
||||
sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
|
||||
@@ -1020,13 +927,15 @@ RUN case "${PG_VERSION}" in \
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build-pgrx12 AS pg-tiktoken-pg-build
|
||||
FROM rust-extensions-build AS pg-tiktoken-pg-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# doesn't use releases
|
||||
# 9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7 - on Oct 29, 2024
|
||||
RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \
|
||||
echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \
|
||||
# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "pg_tiktoken does not have versions, nor support for pg17" && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
|
||||
echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
|
||||
# TODO update pgrx version in the pg_tiktoken repo and remove this line
|
||||
sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \
|
||||
@@ -1044,8 +953,6 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04
|
||||
FROM rust-extensions-build AS pg-pgx-ulid-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# doesn't support v17 yet
|
||||
# https://github.com/pksunkara/pgx_ulid/pull/52
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \
|
||||
esac && \
|
||||
@@ -1063,16 +970,16 @@ RUN case "${PG_VERSION}" in "v17") \
|
||||
#
|
||||
#########################################################################################
|
||||
|
||||
FROM rust-extensions-build-pgrx12 AS pg-session-jwt-build
|
||||
FROM rust-extensions-build AS pg-session-jwt-build
|
||||
ARG PG_VERSION
|
||||
|
||||
# NOTE: local_proxy depends on the version of pg_session_jwt
|
||||
# Do not update without approve from proxy team
|
||||
# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
|
||||
RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
RUN case "${PG_VERSION}" in "v17") \
|
||||
echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \
|
||||
esac && \
|
||||
wget https://github.com/neondatabase/pg_session_jwt/archive/e1310b08ba51377a19e0559e4d1194883b9b2ba2.tar.gz -O pg_session_jwt.tar.gz && \
|
||||
echo "837932a077888d5545fd54b0abcc79e5f8e37017c2769a930afc2f5c94df6f4e pg_session_jwt.tar.gz" | sha256sum --check && \
|
||||
mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
|
||||
sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
|
||||
cargo pgrx install --release
|
||||
|
||||
#########################################################################################
|
||||
@@ -1145,10 +1052,7 @@ FROM rust-extensions-build AS pg-mooncake-build
|
||||
ARG PG_VERSION
|
||||
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
|
||||
# The topmost commit in the `neon` branch at the time of writing this
|
||||
# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/
|
||||
# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af
|
||||
ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af
|
||||
ENV PG_MOONCAKE_VERSION=0a7de4c0b5c7b1a5e2175e1c5f4625b97b7346f1
|
||||
ENV PATH="/usr/local/pgsql/bin/:$PATH"
|
||||
|
||||
RUN case "${PG_VERSION}" in \
|
||||
@@ -1181,7 +1085,6 @@ COPY --from=h3-pg-build /h3/usr /
|
||||
COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
@@ -1315,10 +1218,7 @@ RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin l
|
||||
#########################################################################################
|
||||
|
||||
FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter
|
||||
|
||||
# Keep the version the same as in build-tools.Dockerfile and
|
||||
# test_runner/regress/test_compute_metrics.py.
|
||||
FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter
|
||||
FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
|
||||
|
||||
#########################################################################################
|
||||
#
|
||||
@@ -1374,7 +1274,6 @@ COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/
|
||||
COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/
|
||||
COPY --from=vector-pg-build /pgvector.patch /ext-src/
|
||||
COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src
|
||||
#COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/
|
||||
#COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src
|
||||
#COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src
|
||||
#COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src
|
||||
@@ -1471,8 +1370,6 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy
|
||||
COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
|
||||
COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter
|
||||
|
||||
COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml
|
||||
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml
|
||||
COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
metrics: [
|
||||
import 'sql_exporter/checkpoints_req.libsonnet',
|
||||
import 'sql_exporter/checkpoints_timed.libsonnet',
|
||||
import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet',
|
||||
import 'sql_exporter/compute_current_lsn.libsonnet',
|
||||
import 'sql_exporter/compute_logical_snapshot_files.libsonnet',
|
||||
import 'sql_exporter/compute_receive_lsn.libsonnet',
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
function(collector_name, collector_file, connection_string) {
|
||||
function(collector_name, collector_file, application_name='sql_exporter') {
|
||||
// Configuration for sql_exporter for autoscaling-agent
|
||||
// Global defaults.
|
||||
global: {
|
||||
@@ -23,7 +23,7 @@ function(collector_name, collector_file, connection_string) {
|
||||
target: {
|
||||
// Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
|
||||
// the schema gets dropped or replaced to match the driver expected DSN format.
|
||||
data_source_name: connection_string,
|
||||
data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]),
|
||||
|
||||
// Collectors (referenced by name) to execute on the target.
|
||||
// Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
|
||||
|
||||
@@ -1,10 +0,0 @@
|
||||
{
|
||||
metric_name: 'compute_backpressure_throttling_seconds',
|
||||
type: 'gauge',
|
||||
help: 'Time compute has spent throttled',
|
||||
key_labels: null,
|
||||
values: [
|
||||
'throttled',
|
||||
],
|
||||
query: importstr 'sql_exporter/compute_backpressure_throttling_seconds.sql',
|
||||
}
|
||||
@@ -1 +0,0 @@
|
||||
SELECT neon.backpressure_throttling_time()::float8 / 1000 AS throttled;
|
||||
@@ -26,7 +26,7 @@ commands:
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
|
||||
- name: sql-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -26,7 +26,7 @@ commands:
|
||||
- name: postgres-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml'
|
||||
shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter'
|
||||
- name: sql-exporter
|
||||
user: nobody
|
||||
sysvInitAction: respawn
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::io::BufRead;
|
||||
use std::os::unix::fs::{symlink, PermissionsExt};
|
||||
use std::path::Path;
|
||||
use std::process::{Command, Stdio};
|
||||
@@ -364,43 +365,48 @@ impl ComputeNode {
|
||||
let pageserver_connect_micros = start_time.elapsed().as_micros() as u64;
|
||||
|
||||
let basebackup_cmd = match lsn {
|
||||
Lsn(0) => {
|
||||
if spec.spec.mode != ComputeMode::Primary {
|
||||
format!(
|
||||
"basebackup {} {} --gzip --replica",
|
||||
spec.tenant_id, spec.timeline_id
|
||||
)
|
||||
} else {
|
||||
format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id)
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
if spec.spec.mode != ComputeMode::Primary {
|
||||
format!(
|
||||
"basebackup {} {} {} --gzip --replica",
|
||||
spec.tenant_id, spec.timeline_id, lsn
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"basebackup {} {} {} --gzip",
|
||||
spec.tenant_id, spec.timeline_id, lsn
|
||||
)
|
||||
}
|
||||
}
|
||||
// HACK We don't use compression on first start (Lsn(0)) because there's no API for it
|
||||
Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id),
|
||||
_ => format!(
|
||||
"basebackup {} {} {} --gzip",
|
||||
spec.tenant_id, spec.timeline_id, lsn
|
||||
),
|
||||
};
|
||||
|
||||
let copyreader = client.copy_out(basebackup_cmd.as_str())?;
|
||||
let mut measured_reader = MeasuredReader::new(copyreader);
|
||||
|
||||
// Check the magic number to see if it's a gzip or not. Even though
|
||||
// we might explicitly ask for gzip, an old pageserver with no implementation
|
||||
// of gzip compression might send us uncompressed data. After some time
|
||||
// passes we can assume all pageservers know how to compress and we can
|
||||
// delete this check.
|
||||
//
|
||||
// If the data is not gzip, it will be tar. It will not be mistakenly
|
||||
// recognized as gzip because tar starts with an ascii encoding of a filename,
|
||||
// and 0x1f and 0x8b are unlikely first characters for any filename. Moreover,
|
||||
// we send the "global" directory first from the pageserver, so it definitely
|
||||
// won't be recognized as gzip.
|
||||
let mut bufreader = std::io::BufReader::new(&mut measured_reader);
|
||||
let gzip = {
|
||||
let peek = bufreader.fill_buf().unwrap();
|
||||
peek[0] == 0x1f && peek[1] == 0x8b
|
||||
};
|
||||
|
||||
// Read the archive directly from the `CopyOutReader`
|
||||
//
|
||||
// Set `ignore_zeros` so that unpack() reads all the Copy data and
|
||||
// doesn't stop at the end-of-archive marker. Otherwise, if the server
|
||||
// sends an Error after finishing the tarball, we will not notice it.
|
||||
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
if gzip {
|
||||
let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
} else {
|
||||
let mut ar = tar::Archive::new(&mut bufreader);
|
||||
ar.set_ignore_zeros(true);
|
||||
ar.unpack(&self.pgdata)?;
|
||||
};
|
||||
|
||||
// Report metrics
|
||||
let mut state = self.state.lock().unwrap();
|
||||
|
||||
@@ -73,19 +73,6 @@ pub fn write_postgres_conf(
|
||||
)?;
|
||||
}
|
||||
|
||||
// Locales
|
||||
if cfg!(target_os = "macos") {
|
||||
writeln!(file, "lc_messages='C'")?;
|
||||
writeln!(file, "lc_monetary='C'")?;
|
||||
writeln!(file, "lc_time='C'")?;
|
||||
writeln!(file, "lc_numeric='C'")?;
|
||||
} else {
|
||||
writeln!(file, "lc_messages='C.UTF-8'")?;
|
||||
writeln!(file, "lc_monetary='C.UTF-8'")?;
|
||||
writeln!(file, "lc_time='C.UTF-8'")?;
|
||||
writeln!(file, "lc_numeric='C.UTF-8'")?;
|
||||
}
|
||||
|
||||
match spec.mode {
|
||||
ComputeMode::Primary => {}
|
||||
ComputeMode::Static(lsn) => {
|
||||
|
||||
@@ -944,9 +944,6 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
other: Default::default(),
|
||||
// Typical developer machines use disks with slow fsync, and we don't care
|
||||
// about data integrity: disable disk syncs.
|
||||
no_sync: true,
|
||||
}
|
||||
})
|
||||
.collect(),
|
||||
|
||||
@@ -225,7 +225,6 @@ pub struct PageServerConf {
|
||||
pub listen_http_addr: String,
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
pub no_sync: bool,
|
||||
}
|
||||
|
||||
impl Default for PageServerConf {
|
||||
@@ -236,7 +235,6 @@ impl Default for PageServerConf {
|
||||
listen_http_addr: String::new(),
|
||||
pg_auth_type: AuthType::Trust,
|
||||
http_auth_type: AuthType::Trust,
|
||||
no_sync: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -251,8 +249,6 @@ pub struct NeonLocalInitPageserverConf {
|
||||
pub listen_http_addr: String,
|
||||
pub pg_auth_type: AuthType,
|
||||
pub http_auth_type: AuthType,
|
||||
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
|
||||
pub no_sync: bool,
|
||||
#[serde(flatten)]
|
||||
pub other: HashMap<String, toml::Value>,
|
||||
}
|
||||
@@ -265,7 +261,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
|
||||
listen_http_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
no_sync,
|
||||
other: _,
|
||||
} = conf;
|
||||
Self {
|
||||
@@ -274,7 +269,6 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf {
|
||||
listen_http_addr: listen_http_addr.clone(),
|
||||
pg_auth_type: *pg_auth_type,
|
||||
http_auth_type: *http_auth_type,
|
||||
no_sync: *no_sync,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -575,8 +569,6 @@ impl LocalEnv {
|
||||
listen_http_addr: String,
|
||||
pg_auth_type: AuthType,
|
||||
http_auth_type: AuthType,
|
||||
#[serde(default)]
|
||||
no_sync: bool,
|
||||
}
|
||||
let config_toml_path = dentry.path().join("pageserver.toml");
|
||||
let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str(
|
||||
@@ -599,7 +591,6 @@ impl LocalEnv {
|
||||
listen_http_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
no_sync,
|
||||
} = config_toml;
|
||||
let IdentityTomlSubset {
|
||||
id: identity_toml_id,
|
||||
@@ -616,7 +607,6 @@ impl LocalEnv {
|
||||
listen_http_addr,
|
||||
pg_auth_type,
|
||||
http_auth_type,
|
||||
no_sync,
|
||||
};
|
||||
pageservers.push(conf);
|
||||
}
|
||||
|
||||
@@ -17,7 +17,7 @@ use std::time::Duration;
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::models::{self, TenantInfo, TimelineInfo};
|
||||
use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo};
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_client::mgmt_api;
|
||||
use postgres_backend::AuthType;
|
||||
@@ -273,7 +273,6 @@ impl PageServerNode {
|
||||
)
|
||||
})?;
|
||||
let args = vec!["-D", datadir_path_str];
|
||||
|
||||
background_process::start_process(
|
||||
"pageserver",
|
||||
&datadir,
|
||||
@@ -335,20 +334,17 @@ impl PageServerNode {
|
||||
checkpoint_distance: settings
|
||||
.remove("checkpoint_distance")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'checkpoint_distance' as an integer")?,
|
||||
.transpose()?,
|
||||
checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()),
|
||||
compaction_target_size: settings
|
||||
.remove("compaction_target_size")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_target_size' as an integer")?,
|
||||
.transpose()?,
|
||||
compaction_period: settings.remove("compaction_period").map(|x| x.to_string()),
|
||||
compaction_threshold: settings
|
||||
.remove("compaction_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_threshold' as an integer")?,
|
||||
.transpose()?,
|
||||
compaction_algorithm: settings
|
||||
.remove("compaction_algorithm")
|
||||
.map(serde_json::from_str)
|
||||
@@ -357,19 +353,16 @@ impl PageServerNode {
|
||||
gc_horizon: settings
|
||||
.remove("gc_horizon")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_horizon' as an integer")?,
|
||||
.transpose()?,
|
||||
gc_period: settings.remove("gc_period").map(|x| x.to_string()),
|
||||
image_creation_threshold: settings
|
||||
.remove("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
|
||||
.transpose()?,
|
||||
image_layer_creation_check_threshold: settings
|
||||
.remove("image_layer_creation_check_threshold")
|
||||
.map(|x| x.parse::<u8>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
|
||||
.transpose()?,
|
||||
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
||||
walreceiver_connect_timeout: settings
|
||||
.remove("walreceiver_connect_timeout")
|
||||
@@ -406,15 +399,15 @@ impl PageServerNode {
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
switch_aux_file_policy: settings
|
||||
.remove("switch_aux_file_policy")
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
|
||||
lsn_lease_length_for_ts: settings
|
||||
.remove("lsn_lease_length_for_ts")
|
||||
.map(|x| x.to_string()),
|
||||
timeline_offloading: settings
|
||||
.remove("timeline_offloading")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'timeline_offloading' as bool")?,
|
||||
};
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
@@ -426,9 +419,102 @@ impl PageServerNode {
|
||||
pub async fn tenant_config(
|
||||
&self,
|
||||
tenant_id: TenantId,
|
||||
settings: HashMap<&str, &str>,
|
||||
mut settings: HashMap<&str, &str>,
|
||||
) -> anyhow::Result<()> {
|
||||
let config = Self::parse_config(settings)?;
|
||||
let config = {
|
||||
// Braces to make the diff easier to read
|
||||
models::TenantConfig {
|
||||
checkpoint_distance: settings
|
||||
.remove("checkpoint_distance")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'checkpoint_distance' as an integer")?,
|
||||
checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()),
|
||||
compaction_target_size: settings
|
||||
.remove("compaction_target_size")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_target_size' as an integer")?,
|
||||
compaction_period: settings.remove("compaction_period").map(|x| x.to_string()),
|
||||
compaction_threshold: settings
|
||||
.remove("compaction_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_threshold' as an integer")?,
|
||||
compaction_algorithm: settings
|
||||
.remove("compactin_algorithm")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse 'compaction_algorithm' json")?,
|
||||
gc_horizon: settings
|
||||
.remove("gc_horizon")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'gc_horizon' as an integer")?,
|
||||
gc_period: settings.remove("gc_period").map(|x| x.to_string()),
|
||||
image_creation_threshold: settings
|
||||
.remove("image_creation_threshold")
|
||||
.map(|x| x.parse::<usize>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
|
||||
image_layer_creation_check_threshold: settings
|
||||
.remove("image_layer_creation_check_threshold")
|
||||
.map(|x| x.parse::<u8>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
|
||||
|
||||
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
|
||||
walreceiver_connect_timeout: settings
|
||||
.remove("walreceiver_connect_timeout")
|
||||
.map(|x| x.to_string()),
|
||||
lagging_wal_timeout: settings
|
||||
.remove("lagging_wal_timeout")
|
||||
.map(|x| x.to_string()),
|
||||
max_lsn_wal_lag: settings
|
||||
.remove("max_lsn_wal_lag")
|
||||
.map(|x| x.parse::<NonZeroU64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
|
||||
eviction_policy: settings
|
||||
.remove("eviction_policy")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("Failed to parse 'eviction_policy' json")?,
|
||||
min_resident_size_override: settings
|
||||
.remove("min_resident_size_override")
|
||||
.map(|x| x.parse::<u64>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'min_resident_size_override' as an integer")?,
|
||||
evictions_low_residence_duration_metric_threshold: settings
|
||||
.remove("evictions_low_residence_duration_metric_threshold")
|
||||
.map(|x| x.to_string()),
|
||||
heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
|
||||
lazy_slru_download: settings
|
||||
.remove("lazy_slru_download")
|
||||
.map(|x| x.parse::<bool>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'lazy_slru_download' as bool")?,
|
||||
timeline_get_throttle: settings
|
||||
.remove("timeline_get_throttle")
|
||||
.map(serde_json::from_str)
|
||||
.transpose()
|
||||
.context("parse `timeline_get_throttle` from json")?,
|
||||
switch_aux_file_policy: settings
|
||||
.remove("switch_aux_file_policy")
|
||||
.map(|x| x.parse::<AuxFilePolicy>())
|
||||
.transpose()
|
||||
.context("Failed to parse 'switch_aux_file_policy'")?,
|
||||
lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()),
|
||||
lsn_lease_length_for_ts: settings
|
||||
.remove("lsn_lease_length_for_ts")
|
||||
.map(|x| x.to_string()),
|
||||
}
|
||||
};
|
||||
|
||||
if !settings.is_empty() {
|
||||
bail!("Unrecognized tenant settings: {settings:?}")
|
||||
}
|
||||
|
||||
self.http_client
|
||||
.tenant_config(&models::TenantConfigRequest { tenant_id, config })
|
||||
.await?;
|
||||
|
||||
@@ -91,7 +91,7 @@ generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace.
There are two places we need to read the aux files from the pageserver:

* On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that.
* We use the vectored get API to retrieve all aux files while generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API used to always attempt to retrieve every single key within the requested key range, and therefore, we modified it so that keys within `NON_INHERITED_SPARSE_RANGE` do not trigger a missing key error. Furthermore, as aux file reads usually need all layer files intersecting with that key range within the branch and cover a big keyspace, it incurs a large overhead for tracking keyspaces that have not been read. Therefore, for sparse keyspaces, we [do not track](https://github.com/neondatabase/neon/pull/9631) `ummapped_keyspace`.
* We use the vectored get API to retrieve all aux files while generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API will attempt to retrieve every single key within the requested key range, and therefore, we modified it so that keys within `NON_INHERITED_SPARSE_RANGE` do not trigger a missing key error. A minimal sketch of this sparse-read behaviour follows below.

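To make the behaviour above concrete, here is a minimal, self-contained sketch. It is not the pageserver's actual code or API: the names `vectored_get`, `ReadError`, and the `sparse` flag are hypothetical, and a `BTreeMap` stands in for the layered store. The only point it demonstrates is that a read over a dense keyspace treats a hole in the requested range as a missing-key error, while a read over a sparse keyspace (such as the aux file range) simply skips absent keys.

```rust
// Hypothetical sketch, not the pageserver's real vectored get path.
use std::collections::BTreeMap;
use std::ops::Range;

type Key = u64;

#[derive(Debug)]
enum ReadError {
    MissingKey(Key),
}

/// Reads every key in `range` from `store`. For a dense keyspace a hole is an
/// error; for a sparse keyspace (e.g. aux files) holes are expected and skipped.
fn vectored_get(
    store: &BTreeMap<Key, Vec<u8>>,
    range: Range<Key>,
    sparse: bool,
) -> Result<BTreeMap<Key, Vec<u8>>, ReadError> {
    let mut out = BTreeMap::new();
    for key in range {
        match store.get(&key) {
            Some(value) => {
                out.insert(key, value.clone());
            }
            // Sparse keyspace: absence is expected, keep going.
            None if sparse => continue,
            // Dense keyspace: a missing key is a hard error.
            None => return Err(ReadError::MissingKey(key)),
        }
    }
    Ok(out)
}

fn main() {
    let mut store = BTreeMap::new();
    store.insert(3u64, b"aux file contents".to_vec());

    // Dense read over 0..8 fails at the first hole; sparse read returns only key 3.
    assert!(vectored_get(&store, 0..8, false).is_err());
    assert_eq!(vectored_get(&store, 0..8, true).unwrap().len(), 1);
}
```

In the same spirit, treating absence as expected rather than exceptional is what lets the real implementation avoid tracking which parts of a sparse range were left unread.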
## Compaction and Image Layer Generation


@@ -110,23 +110,6 @@ static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] =
    &[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0];

/// Constructs histogram buckets that are powers of two starting at 1 (i.e. 2^0), covering the end
/// points. For example, passing start=5,end=20 yields 4,8,16,32 as does start=4,end=32.
pub fn pow2_buckets(start: usize, end: usize) -> Vec<f64> {
    assert_ne!(start, 0);
    assert!(start <= end);
    let start = match start.checked_next_power_of_two() {
        Some(n) if n == start => n, // start already power of two
        Some(n) => n >> 1,          // power of two below start
        None => panic!("start too large"),
    };
    let end = end.checked_next_power_of_two().expect("end too large");
    std::iter::successors(Some(start), |n| n.checked_mul(2))
        .take_while(|n| n <= &end)
        .map(|n| n as f64)
        .collect()
}

pub struct BuildInfo {
|
||||
pub revision: &'static str,
|
||||
pub build_tag: &'static str,
|
||||
@@ -612,67 +595,3 @@ where
|
||||
self.dec.collect_into(metadata, labels, name, &mut enc.0)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
const POW2_BUCKETS_MAX: usize = 1 << (usize::BITS - 1);
|
||||
|
||||
#[test]
|
||||
fn pow2_buckets_cases() {
|
||||
assert_eq!(pow2_buckets(1, 1), vec![1.0]);
|
||||
assert_eq!(pow2_buckets(1, 2), vec![1.0, 2.0]);
|
||||
assert_eq!(pow2_buckets(1, 3), vec![1.0, 2.0, 4.0]);
|
||||
assert_eq!(pow2_buckets(1, 4), vec![1.0, 2.0, 4.0]);
|
||||
assert_eq!(pow2_buckets(1, 5), vec![1.0, 2.0, 4.0, 8.0]);
|
||||
assert_eq!(pow2_buckets(1, 6), vec![1.0, 2.0, 4.0, 8.0]);
|
||||
assert_eq!(pow2_buckets(1, 7), vec![1.0, 2.0, 4.0, 8.0]);
|
||||
assert_eq!(pow2_buckets(1, 8), vec![1.0, 2.0, 4.0, 8.0]);
|
||||
assert_eq!(
|
||||
pow2_buckets(1, 200),
|
||||
vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0]
|
||||
);
|
||||
|
||||
assert_eq!(pow2_buckets(1, 8), vec![1.0, 2.0, 4.0, 8.0]);
|
||||
assert_eq!(pow2_buckets(2, 8), vec![2.0, 4.0, 8.0]);
|
||||
assert_eq!(pow2_buckets(3, 8), vec![2.0, 4.0, 8.0]);
|
||||
assert_eq!(pow2_buckets(4, 8), vec![4.0, 8.0]);
|
||||
assert_eq!(pow2_buckets(5, 8), vec![4.0, 8.0]);
|
||||
assert_eq!(pow2_buckets(6, 8), vec![4.0, 8.0]);
|
||||
assert_eq!(pow2_buckets(7, 8), vec![4.0, 8.0]);
|
||||
assert_eq!(pow2_buckets(8, 8), vec![8.0]);
|
||||
assert_eq!(pow2_buckets(20, 200), vec![16.0, 32.0, 64.0, 128.0, 256.0]);
|
||||
|
||||
// Largest valid values.
|
||||
assert_eq!(
|
||||
pow2_buckets(1, POW2_BUCKETS_MAX).len(),
|
||||
usize::BITS as usize
|
||||
);
|
||||
assert_eq!(pow2_buckets(POW2_BUCKETS_MAX, POW2_BUCKETS_MAX).len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn pow2_buckets_zero_start() {
|
||||
pow2_buckets(0, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn pow2_buckets_end_lt_start() {
|
||||
pow2_buckets(2, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn pow2_buckets_end_overflow_min() {
|
||||
pow2_buckets(1, POW2_BUCKETS_MAX + 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn pow2_buckets_end_overflow_max() {
|
||||
pow2_buckets(1, usize::MAX);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -64,7 +64,6 @@ pub struct ConfigToml {
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub wal_redo_timeout: Duration,
|
||||
pub superuser: String,
|
||||
pub locale: String,
|
||||
pub page_cache_size: usize,
|
||||
pub max_file_descriptors: usize,
|
||||
pub pg_distrib_dir: Option<Utf8PathBuf>,
|
||||
@@ -107,8 +106,6 @@ pub struct ConfigToml {
|
||||
pub ephemeral_bytes_per_memory_kb: usize,
|
||||
pub l0_flush: Option<crate::models::L0FlushConfig>,
|
||||
pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub no_sync: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
|
||||
@@ -253,6 +250,12 @@ pub struct TenantConfigToml {
|
||||
// Expressed in multiples of checkpoint distance.
|
||||
pub image_layer_creation_check_threshold: u8,
|
||||
|
||||
/// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into
|
||||
/// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions.
|
||||
/// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux
|
||||
/// file is written.
|
||||
pub switch_aux_file_policy: crate::models::AuxFilePolicy,
|
||||
|
||||
/// The length for an explicit LSN lease request.
|
||||
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
|
||||
#[serde(with = "humantime_serde")]
|
||||
@@ -262,10 +265,6 @@ pub struct TenantConfigToml {
|
||||
/// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval.
|
||||
#[serde(with = "humantime_serde")]
|
||||
pub lsn_lease_length_for_ts: Duration,
|
||||
|
||||
/// Enable auto-offloading of timelines.
|
||||
/// (either this flag or the pageserver-global one need to be set)
|
||||
pub timeline_offloading: bool,
|
||||
}
|
||||
|
||||
pub mod defaults {
|
||||
@@ -277,11 +276,6 @@ pub mod defaults {
|
||||
pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s";
|
||||
|
||||
pub const DEFAULT_SUPERUSER: &str = "cloud_admin";
|
||||
pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
|
||||
"C"
|
||||
} else {
|
||||
"C.UTF-8"
|
||||
};
|
||||
|
||||
pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192;
|
||||
pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100;
|
||||
@@ -332,7 +326,6 @@ impl Default for ConfigToml {
|
||||
wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT)
|
||||
.expect("cannot parse default wal redo timeout")),
|
||||
superuser: (DEFAULT_SUPERUSER.to_string()),
|
||||
locale: DEFAULT_LOCALE.to_string(),
|
||||
page_cache_size: (DEFAULT_PAGE_CACHE_SIZE),
|
||||
max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS),
|
||||
pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir()
|
||||
@@ -398,7 +391,6 @@ impl Default for ConfigToml {
|
||||
l0_flush: None,
|
||||
virtual_file_io_mode: None,
|
||||
tenant_config: TenantConfigToml::default(),
|
||||
no_sync: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -483,9 +475,9 @@ impl Default for TenantConfigToml {
|
||||
lazy_slru_download: false,
|
||||
timeline_get_throttle: crate::models::ThrottleConfig::disabled(),
|
||||
image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD,
|
||||
switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(),
|
||||
lsn_lease_length: LsnLease::DEFAULT_LENGTH,
|
||||
lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
|
||||
timeline_offloading: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,11 +5,9 @@ pub mod controller_api;
|
||||
pub mod key;
|
||||
pub mod keyspace;
|
||||
pub mod models;
|
||||
pub mod record;
|
||||
pub mod reltag;
|
||||
pub mod shard;
|
||||
/// Public API types
|
||||
pub mod upcall_api;
|
||||
pub mod value;
|
||||
|
||||
pub mod config;
|
||||
|
||||
@@ -10,6 +10,7 @@ use std::{
|
||||
io::{BufRead, Read},
|
||||
num::{NonZeroU32, NonZeroU64, NonZeroUsize},
|
||||
str::FromStr,
|
||||
sync::atomic::AtomicUsize,
|
||||
time::{Duration, SystemTime},
|
||||
};
|
||||
|
||||
@@ -308,9 +309,9 @@ pub struct TenantConfig {
|
||||
pub lazy_slru_download: Option<bool>,
|
||||
pub timeline_get_throttle: Option<ThrottleConfig>,
|
||||
pub image_layer_creation_check_threshold: Option<u8>,
|
||||
pub switch_aux_file_policy: Option<AuxFilePolicy>,
|
||||
pub lsn_lease_length: Option<String>,
|
||||
pub lsn_lease_length_for_ts: Option<String>,
|
||||
pub timeline_offloading: Option<bool>,
|
||||
}
|
||||
|
||||
/// The policy for the aux file storage.
|
||||
@@ -349,6 +350,68 @@ pub enum AuxFilePolicy {
|
||||
CrossValidation,
|
||||
}
|
||||
|
||||
impl AuxFilePolicy {
|
||||
pub fn is_valid_migration_path(from: Option<Self>, to: Self) -> bool {
|
||||
matches!(
|
||||
(from, to),
|
||||
(None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2)
|
||||
)
|
||||
}
|
||||
|
||||
/// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used.
|
||||
pub fn default_tenant_config() -> Self {
|
||||
Self::V2
|
||||
}
|
||||
}
|
||||
|
||||
/// The aux file policy memory flag. Users can store `Option<AuxFilePolicy>` into this atomic flag. 0 == unspecified.
|
||||
pub struct AtomicAuxFilePolicy(AtomicUsize);
|
||||
|
||||
impl AtomicAuxFilePolicy {
|
||||
pub fn new(policy: Option<AuxFilePolicy>) -> Self {
|
||||
Self(AtomicUsize::new(
|
||||
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||
))
|
||||
}
|
||||
|
||||
pub fn load(&self) -> Option<AuxFilePolicy> {
|
||||
match self.0.load(std::sync::atomic::Ordering::Acquire) {
|
||||
0 => None,
|
||||
other => Some(AuxFilePolicy::from_usize(other)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn store(&self, policy: Option<AuxFilePolicy>) {
|
||||
self.0.store(
|
||||
policy.map(AuxFilePolicy::to_usize).unwrap_or_default(),
|
||||
std::sync::atomic::Ordering::Release,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
impl AuxFilePolicy {
|
||||
pub fn to_usize(self) -> usize {
|
||||
match self {
|
||||
Self::V1 => 1,
|
||||
Self::CrossValidation => 2,
|
||||
Self::V2 => 3,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn try_from_usize(this: usize) -> Option<Self> {
|
||||
match this {
|
||||
1 => Some(Self::V1),
|
||||
2 => Some(Self::CrossValidation),
|
||||
3 => Some(Self::V2),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_usize(this: usize) -> Self {
|
||||
Self::try_from_usize(this).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(tag = "kind")]
|
||||
pub enum EvictionPolicy {
|
||||
@@ -1570,6 +1633,71 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aux_file_migration_path() {
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
None,
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Self-migration is not a valid migration path, and the caller should handle it by itself.
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Migrations not allowed
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::V1
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V2),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
assert!(!AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::V1),
|
||||
AuxFilePolicy::CrossValidation
|
||||
));
|
||||
// Migrations allowed
|
||||
assert!(AuxFilePolicy::is_valid_migration_path(
|
||||
Some(AuxFilePolicy::CrossValidation),
|
||||
AuxFilePolicy::V2
|
||||
));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aux_parse() {
|
||||
assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2);
|
||||
assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2);
|
||||
assert_eq!(
|
||||
AuxFilePolicy::from_str("cross-validation").unwrap(),
|
||||
AuxFilePolicy::CrossValidation
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_image_compression_algorithm_parsing() {
|
||||
use ImageCompressionAlgorithm::*;
|
||||
|
||||
@@ -1,113 +0,0 @@
|
||||
//! This module defines the WAL record format used within the pageserver.
|
||||
|
||||
use bytes::Bytes;
|
||||
use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember};
|
||||
use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
|
||||
/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper
|
||||
/// around a PostgreSQL WAL record, or a custom neon-specific "record".
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub enum NeonWalRecord {
|
||||
/// Native PostgreSQL WAL record
|
||||
Postgres { will_init: bool, rec: Bytes },
|
||||
|
||||
/// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear)
|
||||
ClearVisibilityMapFlags {
|
||||
new_heap_blkno: Option<u32>,
|
||||
old_heap_blkno: Option<u32>,
|
||||
flags: u8,
|
||||
},
|
||||
/// Mark transaction IDs as committed on a CLOG page
|
||||
ClogSetCommitted {
|
||||
xids: Vec<TransactionId>,
|
||||
timestamp: TimestampTz,
|
||||
},
|
||||
/// Mark transaction IDs as aborted on a CLOG page
|
||||
ClogSetAborted { xids: Vec<TransactionId> },
|
||||
/// Extend multixact offsets SLRU
|
||||
MultixactOffsetCreate {
|
||||
mid: MultiXactId,
|
||||
moff: MultiXactOffset,
|
||||
},
|
||||
/// Extend multixact members SLRU.
|
||||
MultixactMembersCreate {
|
||||
moff: MultiXactOffset,
|
||||
members: Vec<MultiXactMember>,
|
||||
},
|
||||
/// Update the map of AUX files, either writing or dropping an entry
|
||||
AuxFile {
|
||||
file_path: String,
|
||||
content: Option<Bytes>,
|
||||
},
|
||||
|
||||
/// A testing record for unit testing purposes. It supports append data to an existing image, or clear it.
|
||||
#[cfg(feature = "testing")]
|
||||
Test {
|
||||
/// Append a string to the image.
|
||||
append: String,
|
||||
/// Clear the image before appending.
|
||||
clear: bool,
|
||||
/// Treat this record as an init record. `clear` should be set to true if this field is set
|
||||
/// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and
|
||||
/// its references in `timeline.rs`.
|
||||
will_init: bool,
|
||||
},
|
||||
}
|
||||
|
||||
impl NeonWalRecord {
|
||||
/// Does replaying this WAL record initialize the page from scratch, or does
|
||||
/// it need to be applied over the previous image of the page?
|
||||
pub fn will_init(&self) -> bool {
|
||||
// If you change this function, you'll also need to change ValueBytes::will_init
|
||||
match self {
|
||||
NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,
|
||||
#[cfg(feature = "testing")]
|
||||
NeonWalRecord::Test { will_init, .. } => *will_init,
|
||||
// None of the special neon record types currently initialize the page
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn wal_append(s: impl AsRef<str>) -> Self {
|
||||
Self::Test {
|
||||
append: s.as_ref().to_string(),
|
||||
clear: false,
|
||||
will_init: false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn wal_clear() -> Self {
|
||||
Self::Test {
|
||||
append: "".to_string(),
|
||||
clear: true,
|
||||
will_init: false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub fn wal_init() -> Self {
|
||||
Self::Test {
|
||||
append: "".to_string(),
|
||||
clear: true,
|
||||
will_init: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a human-readable string to describe a WAL record
|
||||
///
|
||||
/// For debugging purposes
|
||||
pub fn describe_wal_record(rec: &NeonWalRecord) -> Result<String, DeserializeError> {
|
||||
match rec {
|
||||
NeonWalRecord::Postgres { will_init, rec } => Ok(format!(
|
||||
"will_init: {}, {}",
|
||||
will_init,
|
||||
describe_postgres_wal_record(rec)?
|
||||
)),
|
||||
_ => Ok(format!("{:?}", rec)),
|
||||
}
|
||||
}
|
||||
@@ -2,7 +2,7 @@
|
||||
use once_cell::sync::Lazy;
|
||||
use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError};
|
||||
use pq_proto::{BeMessage, RowDescriptor};
|
||||
use rustls::crypto::ring;
|
||||
use rustls::crypto::aws_lc_rs;
|
||||
use std::io::Cursor;
|
||||
use std::sync::Arc;
|
||||
use tokio::io::{AsyncRead, AsyncWrite};
|
||||
@@ -94,7 +94,7 @@ async fn simple_select_ssl() {
|
||||
let (client_sock, server_sock) = make_tcp_pair().await;
|
||||
|
||||
let server_cfg =
|
||||
rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider()))
|
||||
rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
|
||||
.with_safe_default_protocol_versions()
|
||||
.expect("aws_lc_rs should support the default protocol versions")
|
||||
.with_no_client_auth()
|
||||
@@ -110,7 +110,7 @@ async fn simple_select_ssl() {
|
||||
});
|
||||
|
||||
let client_cfg =
|
||||
rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider()))
|
||||
rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider()))
|
||||
.with_safe_default_protocol_versions()
|
||||
.expect("aws_lc_rs should support the default protocol versions")
|
||||
.with_root_certificates({
|
||||
|
||||
@@ -15,7 +15,6 @@ memoffset.workspace = true
|
||||
thiserror.workspace = true
|
||||
serde.workspace = true
|
||||
utils.workspace = true
|
||||
tracing.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
env_logger.workspace = true
|
||||
|
||||
@@ -36,7 +36,6 @@ macro_rules! postgres_ffi {
|
||||
pub mod controlfile_utils;
|
||||
pub mod nonrelfile_utils;
|
||||
pub mod wal_craft_test_export;
|
||||
pub mod wal_generator;
|
||||
pub mod waldecoder_handler;
|
||||
pub mod xlog_utils;
|
||||
|
||||
@@ -218,7 +217,6 @@ macro_rules! enum_pgversion {
|
||||
|
||||
pub mod pg_constants;
|
||||
pub mod relfile_utils;
|
||||
pub mod walrecord;
|
||||
|
||||
// Export some widely used datatypes that are unlikely to change across Postgres versions
|
||||
pub use v14::bindings::RepOriginId;
|
||||
@@ -241,7 +239,7 @@ pub use v14::bindings::{CheckPoint, ControlFileData};
|
||||
pub const BLCKSZ: u16 = 8192;
|
||||
pub const RELSEG_SIZE: u32 = 1024 * 1024 * 1024 / (BLCKSZ as u32);
|
||||
pub const XLOG_BLCKSZ: usize = 8192;
|
||||
pub const WAL_SEGMENT_SIZE: usize = 128 * 1024 * 1024;
|
||||
pub const WAL_SEGMENT_SIZE: usize = 16 * 1024 * 1024;
|
||||
|
||||
pub const MAX_SEND_SIZE: usize = XLOG_BLCKSZ * 16;
|
||||
|
||||
|
||||
@@ -1,259 +0,0 @@
|
||||
use std::ffi::{CStr, CString};
|
||||
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use crc32c::crc32c_append;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::bindings::{RmgrId, XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC};
|
||||
use super::xlog_utils::{
|
||||
XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE,
|
||||
XLP_FIRST_IS_CONTRECORD,
|
||||
};
|
||||
use super::XLogRecord;
|
||||
use crate::pg_constants::{
|
||||
RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG,
|
||||
XLR_BLOCK_ID_DATA_SHORT,
|
||||
};
|
||||
use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
|
||||
|
||||
/// A WAL record payload. Will be prefixed by an XLogRecord header when encoded.
|
||||
pub struct Record {
|
||||
pub rmid: RmgrId,
|
||||
pub info: u8,
|
||||
pub data: Bytes,
|
||||
}
|
||||
|
||||
impl Record {
|
||||
/// Encodes the WAL record including an XLogRecord header. prev_lsn is the start position of
|
||||
/// the previous record in the WAL -- this is ignored by the Safekeeper, but not Postgres.
|
||||
pub fn encode(&self, prev_lsn: Lsn) -> Bytes {
|
||||
// Prefix data with block ID and length.
|
||||
let data_header = Bytes::from(match self.data.len() {
|
||||
0 => vec![],
|
||||
1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, self.data.len() as u8],
|
||||
256.. => {
|
||||
let len_bytes = (self.data.len() as u32).to_le_bytes();
|
||||
[&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat()
|
||||
}
|
||||
});
|
||||
|
||||
// Construct the WAL record header.
|
||||
let mut header = XLogRecord {
|
||||
xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + self.data.len()) as u32,
|
||||
xl_xid: 0,
|
||||
xl_prev: prev_lsn.into(),
|
||||
xl_info: self.info,
|
||||
xl_rmid: self.rmid,
|
||||
__bindgen_padding_0: [0; 2],
|
||||
xl_crc: 0, // see below
|
||||
};
|
||||
|
||||
// Compute the CRC checksum for the data, and the header up to the CRC field.
|
||||
let mut crc = 0;
|
||||
crc = crc32c_append(crc, &data_header);
|
||||
crc = crc32c_append(crc, &self.data);
|
||||
crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]);
|
||||
header.xl_crc = crc;
|
||||
|
||||
// Encode the final header and record.
|
||||
let header = header.encode().unwrap();
|
||||
|
||||
[header, data_header, self.data.clone()].concat().into()
|
||||
}
|
||||
}
|
||||
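For reference, a minimal sketch (not part of the diff) of the data-header sizing rule used by Record::encode above: payloads of up to 255 bytes get the 2-byte XLR_BLOCK_ID_DATA_SHORT header, longer payloads the 5-byte XLR_BLOCK_ID_DATA_LONG header.

// Sketch: data-header overhead added by Record::encode for a given payload size.
// Mirrors the match in Record::encode above; not part of the original file.
fn data_header_len(payload_len: usize) -> usize {
    match payload_len {
        0 => 0,         // no data header at all
        1..=255 => 2,   // XLR_BLOCK_ID_DATA_SHORT + 1-byte length
        _ => 1 + 4,     // XLR_BLOCK_ID_DATA_LONG + 4-byte little-endian length
    }
}
// e.g. a 10-byte logical message payload adds 2 bytes of data header,
// while a 1 KiB payload adds 5 bytes, before the fixed XLogRecord header.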
|
||||
/// Generates WAL record payloads.
|
||||
///
|
||||
/// TODO: currently only provides LogicalMessageGenerator for trivial noop messages. Add a generator
|
||||
/// that creates a table and inserts rows.
|
||||
pub trait RecordGenerator: Iterator<Item = Record> {}
|
||||
|
||||
impl<I: Iterator<Item = Record>> RecordGenerator for I {}
|
||||
|
||||
/// Generates binary WAL for use in tests and benchmarks. The provided record generator constructs
|
||||
/// the WAL records. It is used as an iterator which yields encoded bytes for a single WAL record,
|
||||
/// including internal page headers if it spans pages. Concatenating the bytes will yield a
|
||||
/// complete, well-formed WAL, which can be chunked at segment boundaries if desired. Not optimized
|
||||
/// for performance.
|
||||
///
|
||||
/// The WAL format is version-dependent (see e.g. `XLOG_PAGE_MAGIC`), so make sure to import this
|
||||
/// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`).
|
||||
///
|
||||
/// A WAL is split into 16 MB segments. Each segment is split into 8 KB pages, with headers.
|
||||
/// Records are arbitrary length, 8-byte aligned, and may span pages. The layout is e.g.:
|
||||
///
|
||||
/// | Segment 1 | Segment 2 | Segment 3 |
|
||||
/// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 |
|
||||
/// | R1 | R2 |R3| R4 | R5 | R6 | R7 | R8 |
|
||||
#[derive(Default)]
|
||||
pub struct WalGenerator<R: RecordGenerator> {
|
||||
/// Generates record payloads for the WAL.
|
||||
pub record_generator: R,
|
||||
/// Current LSN to append the next record at.
|
||||
///
|
||||
/// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should
|
||||
/// ensure that the LSN is on a valid record boundary (i.e. we can't start appending in the
|
||||
/// middle of an existing record or header, or beyond the end of the existing WAL).
|
||||
pub lsn: Lsn,
|
||||
/// The starting LSN of the previous record. Used in WAL record headers. The Safekeeper doesn't
|
||||
/// care about this, unlike Postgres, but we include it for completeness.
|
||||
pub prev_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl<R: RecordGenerator> WalGenerator<R> {
|
||||
// Hardcode the sys and timeline ID. We can make them configurable if we care about them.
|
||||
const SYS_ID: u64 = 0;
|
||||
const TIMELINE_ID: u32 = 1;
|
||||
|
||||
/// Creates a new WAL generator with the given record generator.
|
||||
pub fn new(record_generator: R) -> WalGenerator<R> {
|
||||
Self {
|
||||
record_generator,
|
||||
lsn: Lsn(0),
|
||||
prev_lsn: Lsn(0),
|
||||
}
|
||||
}
|
||||
|
||||
/// Appends a record with an arbitrary payload at the current LSN, then increments the LSN.
|
||||
/// Returns the WAL bytes for the record, including page headers and padding, and the start LSN.
|
||||
fn append_record(&mut self, record: Record) -> (Lsn, Bytes) {
|
||||
let record = record.encode(self.prev_lsn);
|
||||
let record = Self::insert_pages(record, self.lsn);
|
||||
let record = Self::pad_record(record, self.lsn);
|
||||
let lsn = self.lsn;
|
||||
self.prev_lsn = self.lsn;
|
||||
self.lsn += record.len() as u64;
|
||||
(lsn, record)
|
||||
}
|
||||
|
||||
/// Inserts page headers on 8KB page boundaries. Takes the current LSN position where the record
|
||||
/// is to be appended.
|
||||
fn insert_pages(record: Bytes, mut lsn: Lsn) -> Bytes {
|
||||
// Fast path: record fits in current page, and the page already has a header.
|
||||
if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 {
|
||||
return record;
|
||||
}
|
||||
|
||||
let mut pages = BytesMut::new();
|
||||
let mut remaining = record.clone(); // Bytes::clone() is cheap
|
||||
while !remaining.is_empty() {
|
||||
// At new page boundary, inject page header.
|
||||
if lsn.block_offset() == 0 {
|
||||
let mut page_header = XLogPageHeaderData {
|
||||
xlp_magic: XLOG_PAGE_MAGIC as u16,
|
||||
xlp_info: XLP_BKP_REMOVABLE,
|
||||
xlp_tli: Self::TIMELINE_ID,
|
||||
xlp_pageaddr: lsn.0,
|
||||
xlp_rem_len: 0,
|
||||
__bindgen_padding_0: [0; 4],
|
||||
};
|
||||
// If the record was split across page boundaries, mark as continuation.
|
||||
if remaining.len() < record.len() {
|
||||
page_header.xlp_rem_len = remaining.len() as u32;
|
||||
page_header.xlp_info |= XLP_FIRST_IS_CONTRECORD;
|
||||
}
|
||||
// At start of segment, use a long page header.
|
||||
let page_header = if lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 {
|
||||
page_header.xlp_info |= XLP_LONG_HEADER;
|
||||
XLogLongPageHeaderData {
|
||||
std: page_header,
|
||||
xlp_sysid: Self::SYS_ID,
|
||||
xlp_seg_size: WAL_SEGMENT_SIZE as u32,
|
||||
xlp_xlog_blcksz: XLOG_BLCKSZ as u32,
|
||||
}
|
||||
.encode()
|
||||
.unwrap()
|
||||
} else {
|
||||
page_header.encode().unwrap()
|
||||
};
|
||||
pages.extend_from_slice(&page_header);
|
||||
lsn += page_header.len() as u64;
|
||||
}
|
||||
|
||||
// Append the record up to the next page boundary, if any.
|
||||
let page_free = lsn.remaining_in_block() as usize;
|
||||
let chunk = remaining.split_to(std::cmp::min(page_free, remaining.len()));
|
||||
pages.extend_from_slice(&chunk);
|
||||
lsn += chunk.len() as u64;
|
||||
}
|
||||
pages.freeze()
|
||||
}
|
||||
|
||||
/// Records must be 8-byte aligned. Take an encoded record (including any injected page
|
||||
/// boundaries), starting at the given LSN, and add any necessary padding at the end.
|
||||
fn pad_record(record: Bytes, mut lsn: Lsn) -> Bytes {
|
||||
lsn += record.len() as u64;
|
||||
let padding = lsn.calc_padding(8u64) as usize;
|
||||
if padding == 0 {
|
||||
return record;
|
||||
}
|
||||
[record, Bytes::from(vec![0; padding])].concat().into()
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates WAL records as an iterator.
|
||||
impl<R: RecordGenerator> Iterator for WalGenerator<R> {
|
||||
type Item = (Lsn, Bytes);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let record = self.record_generator.next()?;
|
||||
Some(self.append_record(record))
|
||||
}
|
||||
}
|
||||
|
||||
/// Generates logical message records (effectively noops) with a fixed message.
|
||||
pub struct LogicalMessageGenerator {
|
||||
prefix: CString,
|
||||
message: Vec<u8>,
|
||||
}
|
||||
|
||||
impl LogicalMessageGenerator {
|
||||
const DB_ID: u32 = 0; // hardcoded for now
|
||||
const RM_ID: RmgrId = RM_LOGICALMSG_ID;
|
||||
const INFO: u8 = XLOG_LOGICAL_MESSAGE;
|
||||
|
||||
/// Creates a new LogicalMessageGenerator.
|
||||
pub fn new(prefix: &CStr, message: &[u8]) -> Self {
|
||||
Self {
|
||||
prefix: prefix.to_owned(),
|
||||
message: message.to_owned(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Encodes a logical message.
|
||||
fn encode(prefix: &CStr, message: &[u8]) -> Bytes {
|
||||
let prefix = prefix.to_bytes_with_nul();
|
||||
let header = XlLogicalMessage {
|
||||
db_id: Self::DB_ID,
|
||||
transactional: 0,
|
||||
prefix_size: prefix.len() as u64,
|
||||
message_size: message.len() as u64,
|
||||
};
|
||||
[&header.encode(), prefix, message].concat().into()
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for LogicalMessageGenerator {
|
||||
type Item = Record;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
Some(Record {
|
||||
rmid: Self::RM_ID,
|
||||
info: Self::INFO,
|
||||
data: Self::encode(&self.prefix, &self.message),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl WalGenerator<LogicalMessageGenerator> {
|
||||
/// Convenience method for appending a WAL record with an arbitrary logical message at the
|
||||
/// current WAL LSN position. Returns the start LSN and resulting WAL bytes.
|
||||
pub fn append_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> (Lsn, Bytes) {
|
||||
let record = Record {
|
||||
rmid: LogicalMessageGenerator::RM_ID,
|
||||
info: LogicalMessageGenerator::INFO,
|
||||
data: LogicalMessageGenerator::encode(prefix, message),
|
||||
};
|
||||
self.append_record(record)
|
||||
}
|
||||
}
|
||||
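A minimal usage sketch (not part of the diff) of the generator shown above, assuming the v17 flavour of the module as its doc comment suggests; the prefix and payload values are arbitrary illustrations.

// Sketch: fill one WAL segment with no-op logical messages and chunk the
// byte stream at the segment boundary, as the module doc comment describes.
use std::ffi::CString;

use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator};
use postgres_ffi::WAL_SEGMENT_SIZE;

fn one_segment_of_noops() -> Vec<u8> {
    let prefix = CString::new("test-prefix").unwrap();
    let walgen = WalGenerator::new(LogicalMessageGenerator::new(&prefix, b"payload"));

    let mut segment = Vec::with_capacity(WAL_SEGMENT_SIZE);
    for (_lsn, bytes) in walgen {
        segment.extend_from_slice(&bytes);
        if segment.len() >= WAL_SEGMENT_SIZE {
            // Keep just the first segment's worth of bytes; the remainder would
            // start the next segment file.
            segment.truncate(WAL_SEGMENT_SIZE);
            break;
        }
    }
    segment
}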
@@ -7,12 +7,13 @@
// have been named the same as the corresponding PostgreSQL functions instead.
//

use crc32c::crc32c_append;

use super::super::waldecoder::WalStreamDecoder;
use super::bindings::{
CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
};
use super::wal_generator::LogicalMessageGenerator;
use super::PG_MAJORVERSION;
use crate::pg_constants;
use crate::PG_TLI;
@@ -25,7 +26,7 @@ use bytes::{Buf, Bytes};
use log::*;

use serde::Serialize;
use std::ffi::{CString, OsStr};
use std::ffi::OsStr;
use std::fs::File;
use std::io::prelude::*;
use std::io::ErrorKind;
@@ -38,7 +39,6 @@ use utils::bin_ser::SerializeError;
use utils::lsn::Lsn;

pub const XLOG_FNAME_LEN: usize = 24;
pub const XLP_BKP_REMOVABLE: u16 = 0x0004;
pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001;
pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8;
pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2;
@@ -489,14 +489,64 @@ impl XlLogicalMessage {
|
||||
/// Create new WAL record for non-transactional logical message.
|
||||
/// Used for creating artificial WAL for tests, as a LogicalMessage
/// record is basically a no-op.
|
||||
pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes {
|
||||
// This function can take untrusted input, so discard any NUL bytes in the prefix string.
|
||||
let prefix = CString::new(prefix.replace('\0', "")).expect("no NULs");
|
||||
let message = message.as_bytes();
|
||||
LogicalMessageGenerator::new(&prefix, message)
|
||||
.next()
|
||||
.unwrap()
|
||||
.encode(Lsn(0))
|
||||
///
|
||||
/// NOTE: This leaves the xl_prev field zero. The safekeeper and
|
||||
/// pageserver tolerate that, but PostgreSQL does not.
|
||||
pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
|
||||
let mut prefix_bytes: Vec<u8> = Vec::with_capacity(prefix.len() + 1);
|
||||
prefix_bytes.write_all(prefix.as_bytes()).unwrap();
|
||||
prefix_bytes.push(0);
|
||||
|
||||
let message_bytes = message.as_bytes();
|
||||
|
||||
let logical_message = XlLogicalMessage {
|
||||
db_id: 0,
|
||||
transactional: 0,
|
||||
prefix_size: prefix_bytes.len() as u64,
|
||||
message_size: message_bytes.len() as u64,
|
||||
};
|
||||
|
||||
let mainrdata = logical_message.encode();
|
||||
let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len();
|
||||
// only short mainrdata is supported for now
|
||||
assert!(mainrdata_len <= 255);
|
||||
let mainrdata_len = mainrdata_len as u8;
|
||||
|
||||
let mut data: Vec<u8> = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len];
|
||||
data.extend_from_slice(&mainrdata);
|
||||
data.extend_from_slice(&prefix_bytes);
|
||||
data.extend_from_slice(message_bytes);
|
||||
|
||||
let total_len = XLOG_SIZE_OF_XLOG_RECORD + data.len();
|
||||
|
||||
let mut header = XLogRecord {
|
||||
xl_tot_len: total_len as u32,
|
||||
xl_xid: 0,
|
||||
xl_prev: 0,
|
||||
xl_info: 0,
|
||||
xl_rmid: 21,
|
||||
__bindgen_padding_0: [0u8; 2usize],
|
||||
xl_crc: 0, // crc will be calculated later
|
||||
};
|
||||
|
||||
let header_bytes = header.encode().expect("failed to encode header");
|
||||
let crc = crc32c_append(0, &data);
|
||||
let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]);
|
||||
header.xl_crc = crc;
|
||||
|
||||
let mut wal: Vec<u8> = Vec::new();
|
||||
wal.extend_from_slice(&header.encode().expect("failed to encode header"));
|
||||
wal.extend_from_slice(&data);
|
||||
|
||||
// WAL start position must be aligned at 8 bytes,
|
||||
// this will add padding for the next WAL record.
|
||||
const PADDING: usize = 8;
|
||||
let padding_rem = wal.len() % PADDING;
|
||||
if padding_rem != 0 {
|
||||
wal.resize(wal.len() + PADDING - padding_rem, 0);
|
||||
}
|
||||
|
||||
wal
|
||||
}
|
||||
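A minimal sketch (not part of the diff) of the 8-byte alignment padding applied above: WAL records start at 8-byte aligned positions, so a record whose encoded length is not a multiple of 8 is followed by zero padding.

// Sketch: number of zero bytes appended after a record of the given length.
fn padding_after(record_len: usize) -> usize {
    (8 - record_len % 8) % 8
}
// e.g. a 61-byte record is followed by 3 zero bytes; a 64-byte record by none.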
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -151,7 +151,7 @@ fn check_end_of_wal(
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
const_assert!(WAL_SEGMENT_SIZE == 128 * 1024 * 1024);
|
||||
const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
|
||||
|
||||
#[test]
|
||||
pub fn test_find_end_of_wal_simple() {
|
||||
|
||||
@@ -1,4 +1,4 @@
#!/usr/bin/env bash
#!/bin/bash

set -euxo pipefail

@@ -6,44 +6,9 @@ PG_BIN=$1
WAL_PATH=$2
DATA_DIR=$3
PORT=$4
PG_VERSION=$5
SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)

# The way that initdb is invoked must match how the pageserver runs initdb.
function initdb_with_args {
local cmd=(
"$PG_BIN"/initdb
-E utf8
-U cloud_admin
-D "$DATA_DIR"
--locale 'C.UTF-8'
--lc-collate 'C.UTF-8'
--lc-ctype 'C.UTF-8'
--lc-messages 'C.UTF-8'
--lc-monetary 'C.UTF-8'
--lc-numeric 'C.UTF-8'
--lc-time 'C.UTF-8'
--sysid="$SYSID"
)

case "$PG_VERSION" in
14)
# Postgres 14 and below didn't support --locale-provider
;;
15 | 16)
cmd+=(--locale-provider 'libc')
;;
*)
# Postgres 17 added the builtin provider
cmd+=(--locale-provider 'builtin')
;;
esac

eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}"
}

rm -fr "$DATA_DIR"
initdb_with_args
env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)

@@ -40,11 +40,6 @@ pub enum Scope {
/// Allows access to storage controller APIs used by the scrubber, to interrogate the state
/// of a tenant & post scrub results.
Scrubber,

/// This scope is used for communication with other storage controller instances.
/// At the time of writing, this is only used for the step down request.
#[serde(rename = "controller_peer")]
ControllerPeer,
}

/// JWT payload. See docs/authentication.md for the format

@@ -12,7 +12,7 @@ use crate::seqwait::MonotonicCounter;
pub const XLOG_BLCKSZ: u32 = 8192;

/// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr
#[derive(Clone, Copy, Default, Eq, Ord, PartialEq, PartialOrd, Hash)]
#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)]
pub struct Lsn(pub u64);

impl Serialize for Lsn {
@@ -138,11 +138,6 @@ impl Lsn {
self.0.checked_sub(other).map(Lsn)
}

/// Subtract a number, saturating at numeric bounds instead of overflowing.
pub fn saturating_sub<T: Into<u64>>(self, other: T) -> Lsn {
Lsn(self.0.saturating_sub(other.into()))
}

/// Subtract a number, returning the difference as i128 to avoid overflow.
pub fn widening_sub<T: Into<u64>>(self, other: T) -> i128 {
let other: u64 = other.into();

@@ -1,18 +0,0 @@
[package]
name = "wal_decoder"
version = "0.1.0"
edition.workspace = true
license.workspace = true

[features]
testing = ["pageserver_api/testing"]

[dependencies]
anyhow.workspace = true
bytes.workspace = true
pageserver_api.workspace = true
postgres_ffi.workspace = true
serde.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack = { version = "0.1", path = "../../workspace_hack" }
@@ -1,893 +0,0 @@
|
||||
//! This module contains logic for decoding and interpreting
|
||||
//! raw bytes which represent a raw Postgres WAL record.
|
||||
|
||||
use crate::models::*;
|
||||
use crate::serialized_batch::SerializedValueBatch;
|
||||
use bytes::{Buf, Bytes};
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM;
|
||||
use postgres_ffi::walrecord::*;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
impl InterpretedWalRecord {
|
||||
/// Decode and interpret raw bytes which represent one Postgres WAL record.
|
||||
/// Data blocks which do not match the provided shard identity are filtered out.
|
||||
/// Shard 0 is a special case since it tracks all relation sizes. We only give it
|
||||
/// the keys that are being written as that is enough for updating relation sizes.
|
||||
pub fn from_bytes_filtered(
|
||||
buf: Bytes,
|
||||
shard: &ShardIdentity,
|
||||
record_end_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<InterpretedWalRecord> {
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(buf, &mut decoded, pg_version)?;
|
||||
let xid = decoded.xl_xid;
|
||||
|
||||
let flush_uncommitted = if decoded.is_dbase_create_copy(pg_version) {
|
||||
FlushUncommittedRecords::Yes
|
||||
} else {
|
||||
FlushUncommittedRecords::No
|
||||
};
|
||||
|
||||
let metadata_record = MetadataRecord::from_decoded(&decoded, record_end_lsn, pg_version)?;
|
||||
let batch = SerializedValueBatch::from_decoded_filtered(
|
||||
decoded,
|
||||
shard,
|
||||
record_end_lsn,
|
||||
pg_version,
|
||||
)?;
|
||||
|
||||
Ok(InterpretedWalRecord {
|
||||
metadata_record,
|
||||
batch,
|
||||
end_lsn: record_end_lsn,
|
||||
flush_uncommitted,
|
||||
xid,
|
||||
})
|
||||
}
|
||||
}
|
||||
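A minimal caller-side sketch (not part of the diff) of the entry point above; ShardIdentity::unsharded() and the pg_version value of 17 are assumptions used only for illustration.

// Sketch: decode one raw WAL record into an InterpretedWalRecord for an
// unsharded tenant.
use bytes::Bytes;
use pageserver_api::shard::ShardIdentity;
use utils::lsn::Lsn;
use wal_decoder::models::InterpretedWalRecord;

fn interpret_one(raw: Bytes, record_end_lsn: Lsn) -> anyhow::Result<InterpretedWalRecord> {
    // Assumed helper for an unsharded tenant; any ShardIdentity would do here.
    let shard = ShardIdentity::unsharded();
    // pg_version selects the per-version decoders (14..=17 in the match below).
    InterpretedWalRecord::from_bytes_filtered(raw, &shard, record_end_lsn, 17)
}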
|
||||
impl MetadataRecord {
|
||||
fn from_decoded(
|
||||
decoded: &DecodedWALRecord,
|
||||
record_end_lsn: Lsn,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
// Note: this doesn't actually copy the bytes since
|
||||
// the [`Bytes`] type implements cloning via a level of indirection.
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
|
||||
match decoded.xl_rmid {
|
||||
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
|
||||
Self::decode_heapam_record(&mut buf, decoded, pg_version)
|
||||
}
|
||||
pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version),
|
||||
// Handle other special record types
|
||||
pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded),
|
||||
pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version),
|
||||
pg_constants::RM_TBLSPC_ID => {
|
||||
tracing::trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
|
||||
Ok(None)
|
||||
}
|
||||
pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version),
|
||||
pg_constants::RM_XACT_ID => Self::decode_xact_record(&mut buf, decoded, record_end_lsn),
|
||||
pg_constants::RM_MULTIXACT_ID => {
|
||||
Self::decode_multixact_record(&mut buf, decoded, pg_version)
|
||||
}
|
||||
pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded),
|
||||
// This is an odd duck. It needs to go to all shards.
|
||||
// Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY
|
||||
// in WalIngest::new), we have to send the whole DecodedWalRecord::record to
|
||||
// the pageserver and decode it there.
|
||||
//
|
||||
// Alternatively, one can make the checkpoint part of the subscription protocol
|
||||
// to the pageserver. This should work fine, but can be done at a later point.
|
||||
pg_constants::RM_XLOG_ID => Self::decode_xlog_record(&mut buf, decoded, record_end_lsn),
|
||||
pg_constants::RM_LOGICALMSG_ID => {
|
||||
Self::decode_logical_message_record(&mut buf, decoded)
|
||||
}
|
||||
pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded),
|
||||
pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded),
|
||||
_unexpected => {
|
||||
// TODO: consider failing here instead of blindly doing something without
|
||||
// understanding the protocol
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_heapam_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
// Handle VM bit updates that are implicitly part of heap records.
|
||||
|
||||
// First, look at the record to determine which VM bits need
|
||||
// to be cleared. If either of these variables is set, we
|
||||
// need to clear the corresponding bits in the visibility map.
|
||||
let mut new_heap_blkno: Option<u32> = None;
|
||||
let mut old_heap_blkno: Option<u32> = None;
|
||||
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
|
||||
|
||||
match pg_version {
|
||||
14 => {
|
||||
if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
|
||||
if info == pg_constants::XLOG_HEAP_INSERT {
|
||||
let xlrec = v14::XlHeapInsert::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = v14::XlHeapDelete::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_UPDATE
|
||||
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
|
||||
{
|
||||
let xlrec = v14::XlHeapUpdate::decode(buf);
|
||||
// the size of tuple data is inferred from the size of the record.
|
||||
// we can't validate the remaining number of bytes without parsing
|
||||
// the tuple data.
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
|
||||
}
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
|
||||
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
|
||||
// non-HOT update where the new tuple goes to different page than
|
||||
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
|
||||
// set.
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_LOCK {
|
||||
let xlrec = v14::XlHeapLock::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
|
||||
let xlrec = v14::XlHeapMultiInsert::decode(buf);
|
||||
|
||||
let offset_array_len =
|
||||
if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
|
||||
let xlrec = v14::XlHeapLockUpdated::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
|
||||
}
|
||||
}
|
||||
15 => {
|
||||
if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
|
||||
if info == pg_constants::XLOG_HEAP_INSERT {
|
||||
let xlrec = v15::XlHeapInsert::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = v15::XlHeapDelete::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_UPDATE
|
||||
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
|
||||
{
|
||||
let xlrec = v15::XlHeapUpdate::decode(buf);
|
||||
// the size of tuple data is inferred from the size of the record.
|
||||
// we can't validate the remaining number of bytes without parsing
|
||||
// the tuple data.
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
|
||||
}
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
|
||||
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
|
||||
// non-HOT update where the new tuple goes to different page than
|
||||
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
|
||||
// set.
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_LOCK {
|
||||
let xlrec = v15::XlHeapLock::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
|
||||
let xlrec = v15::XlHeapMultiInsert::decode(buf);
|
||||
|
||||
let offset_array_len =
|
||||
if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
|
||||
let xlrec = v15::XlHeapLockUpdated::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
|
||||
}
|
||||
}
|
||||
16 => {
|
||||
if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
|
||||
if info == pg_constants::XLOG_HEAP_INSERT {
|
||||
let xlrec = v16::XlHeapInsert::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = v16::XlHeapDelete::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_UPDATE
|
||||
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
|
||||
{
|
||||
let xlrec = v16::XlHeapUpdate::decode(buf);
|
||||
// the size of tuple data is inferred from the size of the record.
|
||||
// we can't validate the remaining number of bytes without parsing
|
||||
// the tuple data.
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
|
||||
}
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
|
||||
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
|
||||
// non-HOT update where the new tuple goes to different page than
|
||||
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
|
||||
// set.
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_LOCK {
|
||||
let xlrec = v16::XlHeapLock::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
|
||||
let xlrec = v16::XlHeapMultiInsert::decode(buf);
|
||||
|
||||
let offset_array_len =
|
||||
if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
|
||||
let xlrec = v16::XlHeapLockUpdated::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
|
||||
}
|
||||
}
|
||||
17 => {
|
||||
if decoded.xl_rmid == pg_constants::RM_HEAP_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
|
||||
if info == pg_constants::XLOG_HEAP_INSERT {
|
||||
let xlrec = v17::XlHeapInsert::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_DELETE {
|
||||
let xlrec = v17::XlHeapDelete::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_UPDATE
|
||||
|| info == pg_constants::XLOG_HEAP_HOT_UPDATE
|
||||
{
|
||||
let xlrec = v17::XlHeapUpdate::decode(buf);
|
||||
// the size of tuple data is inferred from the size of the record.
|
||||
// we can't validate the remaining number of bytes without parsing
|
||||
// the tuple data.
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
|
||||
}
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
|
||||
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
|
||||
// non-HOT update where the new tuple goes to different page than
|
||||
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
|
||||
// set.
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP_LOCK {
|
||||
let xlrec = v17::XlHeapLock::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
if info == pg_constants::XLOG_HEAP2_MULTI_INSERT {
|
||||
let xlrec = v17::XlHeapMultiInsert::decode(buf);
|
||||
|
||||
let offset_array_len =
|
||||
if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
} else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED {
|
||||
let xlrec = v17::XlHeapLockUpdated::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid);
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
|
||||
let vm_rel = RelTag {
|
||||
forknum: VISIBILITYMAP_FORKNUM,
|
||||
spcnode: decoded.blocks[0].rnode_spcnode,
|
||||
dbnode: decoded.blocks[0].rnode_dbnode,
|
||||
relnode: decoded.blocks[0].rnode_relnode,
|
||||
};
|
||||
|
||||
Ok(Some(MetadataRecord::Heapam(HeapamRecord::ClearVmBits(
|
||||
ClearVmBits {
|
||||
new_heap_blkno,
|
||||
old_heap_blkno,
|
||||
vm_rel,
|
||||
flags,
|
||||
},
|
||||
))))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_neonmgr_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
// Handle VM bit updates that are implicitly part of heap records.
|
||||
|
||||
// First, look at the record to determine which VM bits need
|
||||
// to be cleared. If either of these variables is set, we
|
||||
// need to clear the corresponding bits in the visibility map.
|
||||
let mut new_heap_blkno: Option<u32> = None;
|
||||
let mut old_heap_blkno: Option<u32> = None;
|
||||
let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS;
|
||||
|
||||
assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID);
|
||||
|
||||
match pg_version {
|
||||
16 | 17 => {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK;
|
||||
|
||||
match info {
|
||||
pg_constants::XLOG_NEON_HEAP_INSERT => {
|
||||
let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf);
|
||||
assert_eq!(0, buf.remaining());
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
}
|
||||
pg_constants::XLOG_NEON_HEAP_DELETE => {
|
||||
let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
}
|
||||
pg_constants::XLOG_NEON_HEAP_UPDATE
|
||||
| pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => {
|
||||
let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf);
|
||||
// the size of tuple data is inferred from the size of the record.
|
||||
// we can't validate the remaining number of bytes without parsing
|
||||
// the tuple data.
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno);
|
||||
}
|
||||
if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 {
|
||||
// PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a
|
||||
// non-HOT update where the new tuple goes to different page than
|
||||
// the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is
|
||||
// set.
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
}
|
||||
pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => {
|
||||
let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf);
|
||||
|
||||
let offset_array_len =
|
||||
if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 {
|
||||
// the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set
|
||||
0
|
||||
} else {
|
||||
size_of::<u16>() * xlrec.ntuples as usize
|
||||
};
|
||||
assert_eq!(offset_array_len, buf.remaining());
|
||||
|
||||
if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 {
|
||||
new_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
}
|
||||
}
|
||||
pg_constants::XLOG_NEON_HEAP_LOCK => {
|
||||
let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf);
|
||||
if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 {
|
||||
old_heap_blkno = Some(decoded.blocks[0].blkno);
|
||||
flags = pg_constants::VISIBILITYMAP_ALL_FROZEN;
|
||||
}
|
||||
}
|
||||
info => anyhow::bail!("Unknown WAL record type for Neon RMGR: {}", info),
|
||||
}
|
||||
}
|
||||
_ => anyhow::bail!(
|
||||
"Neon RMGR has no known compatibility with PostgreSQL version {}",
|
||||
pg_version
|
||||
),
|
||||
}
|
||||
|
||||
if new_heap_blkno.is_some() || old_heap_blkno.is_some() {
|
||||
let vm_rel = RelTag {
|
||||
forknum: VISIBILITYMAP_FORKNUM,
|
||||
spcnode: decoded.blocks[0].rnode_spcnode,
|
||||
dbnode: decoded.blocks[0].rnode_dbnode,
|
||||
relnode: decoded.blocks[0].rnode_relnode,
|
||||
};
|
||||
|
||||
Ok(Some(MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(
|
||||
ClearVmBits {
|
||||
new_heap_blkno,
|
||||
old_heap_blkno,
|
||||
vm_rel,
|
||||
flags,
|
||||
},
|
||||
))))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
fn decode_smgr_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_SMGR_CREATE {
|
||||
let create = XlSmgrCreate::decode(buf);
|
||||
let rel = RelTag {
|
||||
spcnode: create.rnode.spcnode,
|
||||
dbnode: create.rnode.dbnode,
|
||||
relnode: create.rnode.relnode,
|
||||
forknum: create.forknum,
|
||||
};
|
||||
|
||||
return Ok(Some(MetadataRecord::Smgr(SmgrRecord::Create(SmgrCreate {
|
||||
rel,
|
||||
}))));
|
||||
} else if info == pg_constants::XLOG_SMGR_TRUNCATE {
|
||||
let truncate = XlSmgrTruncate::decode(buf);
|
||||
return Ok(Some(MetadataRecord::Smgr(SmgrRecord::Truncate(truncate))));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn decode_dbase_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
// TODO: Refactor this to avoid the duplication between postgres versions.
|
||||
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
tracing::debug!(%info, %pg_version, "handle RM_DBASE_ID");
|
||||
|
||||
if pg_version == 14 {
|
||||
if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
|
||||
let createdb = XlCreateDatabase::decode(buf);
|
||||
tracing::debug!("XLOG_DBASE_CREATE v14");
|
||||
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
|
||||
db_id: createdb.db_id,
|
||||
tablespace_id: createdb.tablespace_id,
|
||||
src_db_id: createdb.src_db_id,
|
||||
src_tablespace_id: createdb.src_tablespace_id,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
} else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP {
|
||||
let dropdb = XlDropDatabase::decode(buf);
|
||||
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
|
||||
db_id: dropdb.db_id,
|
||||
tablespace_ids: dropdb.tablespace_ids,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
}
|
||||
} else if pg_version == 15 {
|
||||
if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
|
||||
tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
|
||||
} else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
|
||||
// The XLOG record was renamed between v14 and v15,
|
||||
// but the record format is the same.
|
||||
// So we can reuse XlCreateDatabase here.
|
||||
tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");
|
||||
|
||||
let createdb = XlCreateDatabase::decode(buf);
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
|
||||
db_id: createdb.db_id,
|
||||
tablespace_id: createdb.tablespace_id,
|
||||
src_db_id: createdb.src_db_id,
|
||||
src_tablespace_id: createdb.src_tablespace_id,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
} else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP {
|
||||
let dropdb = XlDropDatabase::decode(buf);
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
|
||||
db_id: dropdb.db_id,
|
||||
tablespace_ids: dropdb.tablespace_ids,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
}
|
||||
} else if pg_version == 16 {
|
||||
if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
|
||||
tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
|
||||
} else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
|
||||
// The XLOG record was renamed between v14 and v15,
|
||||
// but the record format is the same.
|
||||
// So we can reuse XlCreateDatabase here.
|
||||
tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");
|
||||
|
||||
let createdb = XlCreateDatabase::decode(buf);
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
|
||||
db_id: createdb.db_id,
|
||||
tablespace_id: createdb.tablespace_id,
|
||||
src_db_id: createdb.src_db_id,
|
||||
src_tablespace_id: createdb.src_tablespace_id,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
} else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP {
|
||||
let dropdb = XlDropDatabase::decode(buf);
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
|
||||
db_id: dropdb.db_id,
|
||||
tablespace_ids: dropdb.tablespace_ids,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
}
|
||||
} else if pg_version == 17 {
|
||||
if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG {
|
||||
tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
|
||||
} else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY {
|
||||
// The XLOG record was renamed between v14 and v15,
|
||||
// but the record format is the same.
|
||||
// So we can reuse XlCreateDatabase here.
|
||||
tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY");
|
||||
|
||||
let createdb = XlCreateDatabase::decode(buf);
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate {
|
||||
db_id: createdb.db_id,
|
||||
tablespace_id: createdb.tablespace_id,
|
||||
src_db_id: createdb.src_db_id,
|
||||
src_tablespace_id: createdb.src_tablespace_id,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
} else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP {
|
||||
let dropdb = XlDropDatabase::decode(buf);
|
||||
let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop {
|
||||
db_id: dropdb.db_id,
|
||||
tablespace_ids: dropdb.tablespace_ids,
|
||||
}));
|
||||
|
||||
return Ok(Some(record));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn decode_clog_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
|
||||
|
||||
if info == pg_constants::CLOG_ZEROPAGE {
|
||||
let pageno = if pg_version < 17 {
|
||||
buf.get_u32_le()
|
||||
} else {
|
||||
buf.get_u64_le() as u32
|
||||
};
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
|
||||
Ok(Some(MetadataRecord::Clog(ClogRecord::ZeroPage(
|
||||
ClogZeroPage { segno, rpageno },
|
||||
))))
|
||||
} else {
|
||||
assert!(info == pg_constants::CLOG_TRUNCATE);
|
||||
let xlrec = XlClogTruncate::decode(buf, pg_version);
|
||||
|
||||
Ok(Some(MetadataRecord::Clog(ClogRecord::Truncate(
|
||||
ClogTruncate {
|
||||
pageno: xlrec.pageno,
|
||||
oldest_xid: xlrec.oldest_xid,
|
||||
oldest_xid_db: xlrec.oldest_xid_db,
|
||||
},
|
||||
))))
|
||||
}
|
||||
}
|
||||
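A minimal sketch (not part of the diff) of the segno/rpageno arithmetic above; the value of SLRU_PAGES_PER_SEGMENT (32 in stock PostgreSQL) is an assumption here.

// Sketch: split an SLRU page number into a segment number and a page index
// within that segment, as decode_clog_record does above.
const SLRU_PAGES_PER_SEGMENT: u32 = 32; // assumed value, matching stock PostgreSQL

fn split_pageno(pageno: u32) -> (u32, u32) {
    (
        pageno / SLRU_PAGES_PER_SEGMENT, // segno: which SLRU segment file
        pageno % SLRU_PAGES_PER_SEGMENT, // rpageno: page index within that segment
    )
}
// e.g. split_pageno(100) == (3, 4): page 100 lands in segment 3 at relative page 4.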
|
||||
fn decode_xact_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
|
||||
let origin_id = decoded.origin_id;
|
||||
let xl_xid = decoded.xl_xid;
|
||||
|
||||
if info == pg_constants::XLOG_XACT_COMMIT {
|
||||
let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info);
|
||||
return Ok(Some(MetadataRecord::Xact(XactRecord::Commit(XactCommon {
|
||||
parsed,
|
||||
origin_id,
|
||||
xl_xid,
|
||||
lsn,
|
||||
}))));
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT {
|
||||
let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info);
|
||||
return Ok(Some(MetadataRecord::Xact(XactRecord::Abort(XactCommon {
|
||||
parsed,
|
||||
origin_id,
|
||||
xl_xid,
|
||||
lsn,
|
||||
}))));
|
||||
} else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED {
|
||||
let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info);
|
||||
return Ok(Some(MetadataRecord::Xact(XactRecord::CommitPrepared(
|
||||
XactCommon {
|
||||
parsed,
|
||||
origin_id,
|
||||
xl_xid,
|
||||
lsn,
|
||||
},
|
||||
))));
|
||||
} else if info == pg_constants::XLOG_XACT_ABORT_PREPARED {
|
||||
let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info);
|
||||
return Ok(Some(MetadataRecord::Xact(XactRecord::AbortPrepared(
|
||||
XactCommon {
|
||||
parsed,
|
||||
origin_id,
|
||||
xl_xid,
|
||||
lsn,
|
||||
},
|
||||
))));
|
||||
} else if info == pg_constants::XLOG_XACT_PREPARE {
|
||||
return Ok(Some(MetadataRecord::Xact(XactRecord::Prepare(
|
||||
XactPrepare {
|
||||
xl_xid: decoded.xl_xid,
|
||||
data: Bytes::copy_from_slice(&buf[..]),
|
||||
},
|
||||
))));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn decode_multixact_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
|
||||
if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE
|
||||
|| info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE
|
||||
{
|
||||
let pageno = if pg_version < 17 {
|
||||
buf.get_u32_le()
|
||||
} else {
|
||||
buf.get_u64_le() as u32
|
||||
};
|
||||
let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
|
||||
|
||||
let slru_kind = match info {
|
||||
pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE => SlruKind::MultiXactOffsets,
|
||||
pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE => SlruKind::MultiXactMembers,
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::ZeroPage(
|
||||
MultiXactZeroPage {
|
||||
slru_kind,
|
||||
segno,
|
||||
rpageno,
|
||||
},
|
||||
))));
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
|
||||
let xlrec = XlMultiXactCreate::decode(buf);
|
||||
return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::Create(
|
||||
xlrec,
|
||||
))));
|
||||
} else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
|
||||
let xlrec = XlMultiXactTruncate::decode(buf);
|
||||
return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::Truncate(
|
||||
xlrec,
|
||||
))));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn decode_relmap_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let update = XlRelmapUpdate::decode(buf);
|
||||
|
||||
let mut buf = decoded.record.clone();
|
||||
buf.advance(decoded.main_data_offset);
|
||||
// skip xl_relmap_update
|
||||
buf.advance(12);
|
||||
|
||||
Ok(Some(MetadataRecord::Relmap(RelmapRecord::Update(
|
||||
RelmapUpdate {
|
||||
update,
|
||||
buf: Bytes::copy_from_slice(&buf[..]),
|
||||
},
|
||||
))))
|
||||
}
|
||||
|
||||
fn decode_xlog_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
lsn: Lsn,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
Ok(Some(MetadataRecord::Xlog(XlogRecord::Raw(RawXlogRecord {
|
||||
info,
|
||||
lsn,
|
||||
buf: buf.clone(),
|
||||
}))))
|
||||
}
|
||||
|
||||
fn decode_logical_message_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_LOGICAL_MESSAGE {
|
||||
let xlrec = XlLogicalMessage::decode(buf);
|
||||
let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
if prefix == "neon-test" {
|
||||
return Ok(Some(MetadataRecord::LogicalMessage(
|
||||
LogicalMessageRecord::Failpoint,
|
||||
)));
|
||||
}
|
||||
|
||||
if let Some(path) = prefix.strip_prefix("neon-file:") {
|
||||
let buf_size = xlrec.prefix_size + xlrec.message_size;
|
||||
let buf = Bytes::copy_from_slice(&buf[xlrec.prefix_size..buf_size]);
|
||||
return Ok(Some(MetadataRecord::LogicalMessage(
|
||||
LogicalMessageRecord::Put(PutLogicalMessage {
|
||||
path: path.to_string(),
|
||||
buf,
|
||||
}),
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn decode_standby_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_RUNNING_XACTS {
|
||||
let xlrec = XlRunningXacts::decode(buf);
|
||||
return Ok(Some(MetadataRecord::Standby(StandbyRecord::RunningXacts(
|
||||
StandbyRunningXacts {
|
||||
oldest_running_xid: xlrec.oldest_running_xid,
|
||||
},
|
||||
))));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
fn decode_replorigin_record(
|
||||
buf: &mut Bytes,
|
||||
decoded: &DecodedWALRecord,
|
||||
) -> anyhow::Result<Option<MetadataRecord>> {
|
||||
let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
|
||||
if info == pg_constants::XLOG_REPLORIGIN_SET {
|
||||
let xlrec = XlReploriginSet::decode(buf);
|
||||
return Ok(Some(MetadataRecord::Replorigin(ReploriginRecord::Set(
|
||||
xlrec,
|
||||
))));
|
||||
} else if info == pg_constants::XLOG_REPLORIGIN_DROP {
|
||||
let xlrec = XlReploriginDrop::decode(buf);
|
||||
return Ok(Some(MetadataRecord::Replorigin(ReploriginRecord::Drop(
|
||||
xlrec,
|
||||
))));
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
@@ -1,3 +0,0 @@
|
||||
pub mod decoder;
|
||||
pub mod models;
|
||||
pub mod serialized_batch;
|
||||
@@ -1,211 +0,0 @@
|
||||
//! This module houses types which represent decoded PG WAL records
|
||||
//! ready for the pageserver to interpret. They are derived from the original
|
||||
//! WAL records, so that each struct corresponds closely to one WAL record of
|
||||
//! a specific kind. They contain the same information as the original WAL records,
|
||||
//! but the values are already serialized in a [`SerializedValueBatch`], which
|
||||
//! is the format that the pageserver is expecting them in.
|
||||
//!
|
||||
//! The ingestion code uses these structs to help with parsing the WAL records,
|
||||
//! and it splits them into a stream of modifications to the key-value pairs that
|
||||
//! are ultimately stored in delta layers. See also the split-out counterparts in
|
||||
//! [`postgres_ffi::walrecord`].
|
||||
//!
|
||||
//! The pipeline which processes WAL records is not super obvious, so let's follow
|
||||
//! the flow of an example XACT_COMMIT Postgres record:
|
||||
//!
|
||||
//! (Postgres XACT_COMMIT record)
|
||||
//! |
|
||||
//! |--> pageserver::walingest::WalIngest::decode_xact_record
|
||||
//! |
|
||||
//! |--> ([`XactRecord::Commit`])
|
||||
//! |
|
||||
//! |--> pageserver::walingest::WalIngest::ingest_xact_record
|
||||
//! |
|
||||
//! |--> (NeonWalRecord::ClogSetCommitted)
|
||||
//! |
|
||||
//! |--> write to KV store within the pageserver
|
||||
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::walrecord::{
    XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet,
    XlSmgrTruncate, XlXactParsedRecord,
};
use postgres_ffi::{Oid, TransactionId};
use utils::lsn::Lsn;

use crate::serialized_batch::SerializedValueBatch;

pub enum FlushUncommittedRecords {
    Yes,
    No,
}

/// An interpreted Postgres WAL record, ready to be handled by the pageserver
pub struct InterpretedWalRecord {
    /// Optional metadata record - may cause writes to metadata keys
    /// in the storage engine
    pub metadata_record: Option<MetadataRecord>,
    /// A pre-serialized batch along with the required metadata for ingestion
    /// by the pageserver
    pub batch: SerializedValueBatch,
    /// Byte offset within WAL for the end of the original PG WAL record
    pub end_lsn: Lsn,
    /// Whether to flush all uncommitted modifications to the storage engine
    /// before ingesting this record. This is currently only used for legacy PG
    /// database creations which read pages from a template database. Such WAL
    /// records require reading data blocks while ingesting, hence the need to flush.
    pub flush_uncommitted: FlushUncommittedRecords,
    /// Transaction id of the original PG WAL record
    pub xid: TransactionId,
}

/// The interpreted part of the Postgres WAL record which requires metadata
/// writes to the underlying storage engine.
pub enum MetadataRecord {
    Heapam(HeapamRecord),
    Neonrmgr(NeonrmgrRecord),
    Smgr(SmgrRecord),
    Dbase(DbaseRecord),
    Clog(ClogRecord),
    Xact(XactRecord),
    MultiXact(MultiXactRecord),
    Relmap(RelmapRecord),
    Xlog(XlogRecord),
    LogicalMessage(LogicalMessageRecord),
    Standby(StandbyRecord),
    Replorigin(ReploriginRecord),
}

pub enum HeapamRecord {
    ClearVmBits(ClearVmBits),
}

pub struct ClearVmBits {
    pub new_heap_blkno: Option<u32>,
    pub old_heap_blkno: Option<u32>,
    pub vm_rel: RelTag,
    pub flags: u8,
}

pub enum NeonrmgrRecord {
    ClearVmBits(ClearVmBits),
}

pub enum SmgrRecord {
    Create(SmgrCreate),
    Truncate(XlSmgrTruncate),
}

pub struct SmgrCreate {
    pub rel: RelTag,
}

pub enum DbaseRecord {
    Create(DbaseCreate),
    Drop(DbaseDrop),
}

pub struct DbaseCreate {
    pub db_id: Oid,
    pub tablespace_id: Oid,
    pub src_db_id: Oid,
    pub src_tablespace_id: Oid,
}

pub struct DbaseDrop {
    pub db_id: Oid,
    pub tablespace_ids: Vec<Oid>,
}

pub enum ClogRecord {
    ZeroPage(ClogZeroPage),
    Truncate(ClogTruncate),
}

pub struct ClogZeroPage {
    pub segno: u32,
    pub rpageno: u32,
}

pub struct ClogTruncate {
    pub pageno: u32,
    pub oldest_xid: TransactionId,
    pub oldest_xid_db: Oid,
}

pub enum XactRecord {
    Commit(XactCommon),
    Abort(XactCommon),
    CommitPrepared(XactCommon),
    AbortPrepared(XactCommon),
    Prepare(XactPrepare),
}

pub struct XactCommon {
    pub parsed: XlXactParsedRecord,
    pub origin_id: u16,
    // Fields below are only used for logging
    pub xl_xid: TransactionId,
    pub lsn: Lsn,
}

pub struct XactPrepare {
    pub xl_xid: TransactionId,
    pub data: Bytes,
}

pub enum MultiXactRecord {
    ZeroPage(MultiXactZeroPage),
    Create(XlMultiXactCreate),
    Truncate(XlMultiXactTruncate),
}

pub struct MultiXactZeroPage {
    pub slru_kind: SlruKind,
    pub segno: u32,
    pub rpageno: u32,
}

pub enum RelmapRecord {
    Update(RelmapUpdate),
}

pub struct RelmapUpdate {
    pub update: XlRelmapUpdate,
    pub buf: Bytes,
}

pub enum XlogRecord {
    Raw(RawXlogRecord),
}

pub struct RawXlogRecord {
    pub info: u8,
    pub lsn: Lsn,
    pub buf: Bytes,
}

pub enum LogicalMessageRecord {
    Put(PutLogicalMessage),
    #[cfg(feature = "testing")]
    Failpoint,
}

pub struct PutLogicalMessage {
    pub path: String,
    pub buf: Bytes,
}

pub enum StandbyRecord {
    RunningXacts(StandbyRunningXacts),
}

pub struct StandbyRunningXacts {
    pub oldest_running_xid: TransactionId,
}

pub enum ReploriginRecord {
    Set(XlReploriginSet),
    Drop(XlReploriginDrop),
}
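For orientation, here is a minimal sketch (not part of the crate) of how a consumer might inspect an InterpretedWalRecord after decoding; the describe helper and its output format are illustrative only, assuming the types defined above are in scope.

// Illustrative only: summarise an interpreted record for logging/debugging.
fn describe(record: &InterpretedWalRecord) -> String {
    let metadata = match &record.metadata_record {
        None => "none",
        Some(MetadataRecord::Heapam(_)) | Some(MetadataRecord::Neonrmgr(_)) => "clear VM bits",
        Some(MetadataRecord::Dbase(_)) => "database create/drop",
        Some(MetadataRecord::Xact(_)) => "transaction commit/abort/prepare",
        Some(_) => "other metadata write",
    };
    format!(
        "xid={} end_lsn={} serialized_values={} metadata={}",
        record.xid,
        record.end_lsn,
        record.batch.len(),
        metadata
    )
}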
@@ -1,862 +0,0 @@
//! This module implements batch type for serialized [`pageserver_api::value::Value`]
//! instances. Each batch contains a raw buffer (serialized values)
//! and a list of metadata for each (key, LSN) tuple present in the batch.
//!
//! Such batches are created from decoded PG wal records and ingested
//! by the pageserver by writing directly to the ephemeral file.

use std::collections::BTreeSet;

use bytes::{Bytes, BytesMut};
use pageserver_api::key::rel_block_to_key;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::record::NeonWalRecord;
use pageserver_api::reltag::RelTag;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::{key::CompactKey, value::Value};
use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord};
use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ};
use utils::bin_ser::BeSer;
use utils::lsn::Lsn;

use pageserver_api::key::Key;

static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);

/// Accompanying metadata for the batch
/// A value may be serialized and stored into the batch or just "observed".
/// Shard 0 currently "observes" all values in order to accurately track
/// relation sizes. In the case of "observed" values, we only need to know
/// the key and LSN, so two types of metadata are supported to save on network
/// bandwidth.
pub enum ValueMeta {
    Serialized(SerializedValueMeta),
    Observed(ObservedValueMeta),
}

impl ValueMeta {
    pub fn key(&self) -> CompactKey {
        match self {
            Self::Serialized(ser) => ser.key,
            Self::Observed(obs) => obs.key,
        }
    }

    pub fn lsn(&self) -> Lsn {
        match self {
            Self::Serialized(ser) => ser.lsn,
            Self::Observed(obs) => obs.lsn,
        }
    }
}

/// Wrapper around [`ValueMeta`] that implements ordering by
/// (key, LSN) tuples
struct OrderedValueMeta(ValueMeta);

impl Ord for OrderedValueMeta {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        (self.0.key(), self.0.lsn()).cmp(&(other.0.key(), other.0.lsn()))
    }
}

impl PartialOrd for OrderedValueMeta {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl PartialEq for OrderedValueMeta {
    fn eq(&self, other: &Self) -> bool {
        (self.0.key(), self.0.lsn()) == (other.0.key(), other.0.lsn())
    }
}

impl Eq for OrderedValueMeta {}
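The ordering and equality impls above exist so that metadata entries can be collected into a std::collections::BTreeSet keyed by (key, LSN), which is what zero_gaps relies on further down. A minimal sketch of that behaviour, using only types visible in this file (the function itself is illustrative, not module API):

// Illustrative only: the BTreeSet iterates in (key, LSN) order.
fn ordering_example() {
    use std::collections::BTreeSet;

    let key = Key::from_hex("110000000033333333444444445500000001").unwrap();
    let higher = OrderedValueMeta(ValueMeta::Observed(ObservedValueMeta {
        key: key.to_compact(),
        lsn: Lsn(0x20),
    }));
    let lower = OrderedValueMeta(ValueMeta::Observed(ObservedValueMeta {
        key: key.to_compact(),
        lsn: Lsn(0x10),
    }));

    let ordered: BTreeSet<_> = [higher, lower].into_iter().collect();
    // The entry with the lower LSN for the same key comes first.
    assert_eq!(ordered.iter().next().unwrap().0.lsn(), Lsn(0x10));
}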
/// Metadata for a [`Value`] serialized into the batch.
pub struct SerializedValueMeta {
    pub key: CompactKey,
    pub lsn: Lsn,
    /// Starting offset of the value for the (key, LSN) tuple
    /// in [`SerializedValueBatch::raw`]
    pub batch_offset: u64,
    pub len: usize,
    pub will_init: bool,
}

/// Metadata for a [`Value`] observed by the batch
pub struct ObservedValueMeta {
    pub key: CompactKey,
    pub lsn: Lsn,
}

/// Batch of serialized [`Value`]s.
pub struct SerializedValueBatch {
    /// [`Value`]s serialized in EphemeralFile's native format,
    /// ready for disk write by the pageserver
    pub raw: Vec<u8>,

    /// Metadata to make sense of the bytes in [`Self::raw`]
    /// and represent "observed" values.
    ///
    /// Invariant: Metadata entries for any given key are ordered
    /// by LSN. Note that entries for a key do not have to be contiguous.
    pub metadata: Vec<ValueMeta>,

    /// The highest LSN of any value in the batch
    pub max_lsn: Lsn,

    /// Number of values encoded by [`Self::raw`]
    pub len: usize,
}
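The invariant that matters for readers of this struct is that each SerializedValueMeta entry identifies a slice of raw that deserializes back into exactly one Value. The sketch below shows how a reader could recover the first serialized value from a batch; it mirrors the validate_batch test further down and is an illustration, not part of the module's API:

// Illustrative reader-side sketch: recover the first serialized value.
fn first_value(batch: &SerializedValueBatch) -> Option<Value> {
    batch.metadata.iter().find_map(|meta| match meta {
        ValueMeta::Serialized(ser) => {
            let start = ser.batch_offset as usize;
            let end = start + ser.len;
            // `Value` is encoded with `utils::bin_ser::BeSer`, so `des` reverses `ser_into`.
            Some(Value::des(&batch.raw[start..end]).expect("valid batch"))
        }
        ValueMeta::Observed(_) => None,
    })
}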
impl Default for SerializedValueBatch {
    fn default() -> Self {
        Self {
            raw: Default::default(),
            metadata: Default::default(),
            max_lsn: Lsn(0),
            len: 0,
        }
    }
}

impl SerializedValueBatch {
    /// Build a batch of serialized values from a decoded PG WAL record
    ///
    /// The batch will only contain values for keys targeting the specified
    /// shard. Shard 0 is a special case, where any keys that don't belong to
    /// it are "observed" by the batch (i.e. present in [`SerializedValueBatch::metadata`],
    /// but absent from the raw buffer [`SerializedValueBatch::raw`]).
    pub(crate) fn from_decoded_filtered(
        decoded: DecodedWALRecord,
        shard: &ShardIdentity,
        record_end_lsn: Lsn,
        pg_version: u32,
    ) -> anyhow::Result<SerializedValueBatch> {
        // First determine how big the buffer needs to be and allocate it up-front.
        // This duplicates some of the work below, but it's empirically much faster.
        let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version);
        let mut buf = Vec::<u8>::with_capacity(estimated_buffer_size);

        let mut metadata: Vec<ValueMeta> = Vec::with_capacity(decoded.blocks.len());
        let mut max_lsn: Lsn = Lsn(0);
        let mut len: usize = 0;
        for blk in decoded.blocks.iter() {
            let relative_off = buf.len() as u64;

            let rel = RelTag {
                spcnode: blk.rnode_spcnode,
                dbnode: blk.rnode_dbnode,
                relnode: blk.rnode_relnode,
                forknum: blk.forknum,
            };

            let key = rel_block_to_key(rel, blk.blkno);

            if !key.is_valid_key_on_write_path() {
                anyhow::bail!("Unsupported key decoded at LSN {}: {}", record_end_lsn, key);
            }

            let key_is_local = shard.is_key_local(&key);

            tracing::debug!(
                lsn=%record_end_lsn,
                key=%key,
                "ingest: shard decision {}",
                if !key_is_local { "drop" } else { "keep" },
            );

            if !key_is_local {
                if shard.is_shard_zero() {
                    // Shard 0 tracks relation sizes. Although we will not store this block, we will observe
                    // its blkno in case it implicitly extends a relation.
                    metadata.push(ValueMeta::Observed(ObservedValueMeta {
                        key: key.to_compact(),
                        lsn: record_end_lsn,
                    }))
                }

                continue;
            }

            // Instead of storing full-page-image WAL record,
            // it is better to store extracted image: we can skip wal-redo
            // in this case. Also some FPI records may contain multiple (up to 32) pages,
            // so they have to be copied multiple times.
            //
            let val = if Self::block_is_image(&decoded, blk, pg_version) {
                // Extract page image from FPI record
                let img_len = blk.bimg_len as usize;
                let img_offs = blk.bimg_offset as usize;
                let mut image = BytesMut::with_capacity(BLCKSZ as usize);
                // TODO(vlad): skip the copy
                image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]);

                if blk.hole_length != 0 {
                    let tail = image.split_off(blk.hole_offset as usize);
                    image.resize(image.len() + blk.hole_length as usize, 0u8);
                    image.unsplit(tail);
                }
                //
                // Match the logic of XLogReadBufferForRedoExtended:
                // The page may be uninitialized. If so, we can't set the LSN because
                // that would corrupt the page.
                //
                if !page_is_new(&image) {
                    page_set_lsn(&mut image, record_end_lsn)
                }
                assert_eq!(image.len(), BLCKSZ as usize);

                Value::Image(image.freeze())
            } else {
                Value::WalRecord(NeonWalRecord::Postgres {
                    will_init: blk.will_init || blk.apply_image,
                    rec: decoded.record.clone(),
                })
            };

            val.ser_into(&mut buf)
                .expect("Writing into in-memory buffer is infallible");

            let val_ser_size = buf.len() - relative_off as usize;

            metadata.push(ValueMeta::Serialized(SerializedValueMeta {
                key: key.to_compact(),
                lsn: record_end_lsn,
                batch_offset: relative_off,
                len: val_ser_size,
                will_init: val.will_init(),
            }));
            max_lsn = std::cmp::max(max_lsn, record_end_lsn);
            len += 1;
        }

        if cfg!(any(debug_assertions, test)) {
            let batch = Self {
                raw: buf,
                metadata,
                max_lsn,
                len,
            };

            batch.validate_lsn_order();

            return Ok(batch);
        }

        Ok(Self {
            raw: buf,
            metadata,
            max_lsn,
            len,
        })
    }

    /// Look into the decoded PG WAL record and determine
    /// roughly how large the buffer for serialized values needs to be.
    fn estimate_buffer_size(
        decoded: &DecodedWALRecord,
        shard: &ShardIdentity,
        pg_version: u32,
    ) -> usize {
        let mut estimate: usize = 0;

        for blk in decoded.blocks.iter() {
            let rel = RelTag {
                spcnode: blk.rnode_spcnode,
                dbnode: blk.rnode_dbnode,
                relnode: blk.rnode_relnode,
                forknum: blk.forknum,
            };

            let key = rel_block_to_key(rel, blk.blkno);

            if !shard.is_key_local(&key) {
                continue;
            }

            if Self::block_is_image(decoded, blk, pg_version) {
                // 4 bytes for the Value::Image discriminator
                // 8 bytes for encoding the size of the buffer
                // BLCKSZ for the raw image
                estimate += (4 + 8 + BLCKSZ) as usize;
            } else {
                // 4 bytes for the Value::WalRecord discriminator
                // 4 bytes for the NeonWalRecord::Postgres discriminator
                // 1 byte for NeonWalRecord::Postgres::will_init
                // 8 bytes for encoding the size of the buffer
                // length of the raw record
                estimate += 8 + 1 + 8 + decoded.record.len();
            }
        }

        estimate
    }
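    // As a concrete check of the arithmetic above (added for illustration, not part of
    // the original source): with the usual 8192-byte BLCKSZ, a full-page-image block
    // contributes 4 + 8 + 8192 = 8204 bytes to the estimate, while a hypothetical
    // 200-byte non-FPI record contributes 8 + 1 + 8 + 200 = 217 bytes per block.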
    fn block_is_image(decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, pg_version: u32) -> bool {
        blk.apply_image
            && blk.has_image
            && decoded.xl_rmid == pg_constants::RM_XLOG_ID
            && (decoded.xl_info == pg_constants::XLOG_FPI
                || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT)
            // compression of WAL is not yet supported: fall back to storing the original WAL record
            && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version)
            // do not materialize null pages because they will most likely soon be replaced with real data
            && blk.bimg_len != 0
    }

    /// Encode a list of values and metadata into a serialized batch
    ///
    /// This is used by the pageserver ingest code to conveniently generate
    /// batches for metadata writes.
    pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self {
        // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
        // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
        let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
        let mut buf = Vec::<u8>::with_capacity(buffer_size);

        let mut metadata: Vec<ValueMeta> = Vec::with_capacity(batch.len());
        let mut max_lsn: Lsn = Lsn(0);
        let len = batch.len();
        for (key, lsn, val_ser_size, val) in batch {
            let relative_off = buf.len() as u64;

            val.ser_into(&mut buf)
                .expect("Writing into in-memory buffer is infallible");

            metadata.push(ValueMeta::Serialized(SerializedValueMeta {
                key,
                lsn,
                batch_offset: relative_off,
                len: val_ser_size,
                will_init: val.will_init(),
            }));
            max_lsn = std::cmp::max(max_lsn, lsn);
        }

        // Assert that we didn't do any extra allocations while building the buffer.
        debug_assert!(buf.len() <= buffer_size);

        if cfg!(any(debug_assertions, test)) {
            let batch = Self {
                raw: buf,
                metadata,
                max_lsn,
                len,
            };

            batch.validate_lsn_order();

            return batch;
        }

        Self {
            raw: buf,
            metadata,
            max_lsn,
            len,
        }
    }

    /// Add one value to the batch
    ///
    /// This is used by the pageserver ingest code to include metadata block
    /// updates for a single key.
    pub fn put(&mut self, key: CompactKey, value: Value, lsn: Lsn) {
        let relative_off = self.raw.len() as u64;
        value.ser_into(&mut self.raw).unwrap();

        let val_ser_size = self.raw.len() - relative_off as usize;
        self.metadata
            .push(ValueMeta::Serialized(SerializedValueMeta {
                key,
                lsn,
                batch_offset: relative_off,
                len: val_ser_size,
                will_init: value.will_init(),
            }));

        self.max_lsn = std::cmp::max(self.max_lsn, lsn);
        self.len += 1;

        if cfg!(any(debug_assertions, test)) {
            self.validate_lsn_order();
        }
    }

    /// Extend with the contents of another batch
    ///
    /// One batch is generated for each decoded PG WAL record.
    /// They are then merged to accumulate reasonably sized writes.
    pub fn extend(&mut self, mut other: SerializedValueBatch) {
        let extend_batch_start_offset = self.raw.len() as u64;

        self.raw.extend(other.raw);

        // Shift the offsets in the batch we are extending with
        other.metadata.iter_mut().for_each(|meta| match meta {
            ValueMeta::Serialized(ser) => {
                ser.batch_offset += extend_batch_start_offset;
                if cfg!(debug_assertions) {
                    let value_end = ser.batch_offset + ser.len as u64;
                    assert!((value_end as usize) <= self.raw.len());
                }
            }
            ValueMeta::Observed(_) => {}
        });
        self.metadata.extend(other.metadata);

        self.max_lsn = std::cmp::max(self.max_lsn, other.max_lsn);

        self.len += other.len;

        if cfg!(any(debug_assertions, test)) {
            self.validate_lsn_order();
        }
    }

    /// Add zero images for the (key, LSN) tuples specified
    ///
    /// PG versions below 16 do not zero out pages before extending
    /// a relation and may leave gaps. Such gaps need to be identified
    /// by the pageserver ingest logic and get patched up here.
    ///
    /// Note that this function does not validate that the gaps have been
    /// identified correctly (it does not know relation sizes), so it's up
    /// to the call-site to do it properly.
    pub fn zero_gaps(&mut self, gaps: Vec<(KeySpace, Lsn)>) {
        // Implementation note:
        //
        // Values within [`SerializedValueBatch::raw`] do not have any ordering requirements,
        // but the metadata entries should be ordered properly (see
        // [`SerializedValueBatch::metadata`]).
        //
        // Exploiting this observation we do:
        // 1. Drain all the metadata entries into an ordered set.
        // The use of a BTreeSet keyed by (Key, Lsn) relies on the observation that Postgres never
        // includes more than one update to the same block in the same WAL record.
        // 2. For each (key, LSN) gap tuple, append a zero image to the raw buffer
        // and add an index entry to the ordered metadata set.
        // 3. Drain the ordered set back into a metadata vector

        let mut ordered_metas = self
            .metadata
            .drain(..)
            .map(OrderedValueMeta)
            .collect::<BTreeSet<_>>();
        for (keyspace, lsn) in gaps {
            self.max_lsn = std::cmp::max(self.max_lsn, lsn);

            for gap_range in keyspace.ranges {
                let mut key = gap_range.start;
                while key != gap_range.end {
                    let relative_off = self.raw.len() as u64;

                    // TODO(vlad): Can we be cheeky and write only one zero image, and
                    // make all index entries requiring a zero page point to it?
                    // Alternatively, we can change the index entry format to represent zero pages
                    // without writing them at all.
                    Value::Image(ZERO_PAGE.clone())
                        .ser_into(&mut self.raw)
                        .unwrap();
                    let val_ser_size = self.raw.len() - relative_off as usize;

                    ordered_metas.insert(OrderedValueMeta(ValueMeta::Serialized(
                        SerializedValueMeta {
                            key: key.to_compact(),
                            lsn,
                            batch_offset: relative_off,
                            len: val_ser_size,
                            will_init: true,
                        },
                    )));

                    self.len += 1;

                    key = key.next();
                }
            }
        }

        self.metadata = ordered_metas.into_iter().map(|ord| ord.0).collect();

        if cfg!(any(debug_assertions, test)) {
            self.validate_lsn_order();
        }
    }

    /// Checks if the batch is empty
    ///
    /// A batch is empty when it contains no serialized values.
    /// Note that it may still contain observed values.
    pub fn is_empty(&self) -> bool {
        let empty = self.raw.is_empty();

        if cfg!(debug_assertions) && empty {
            assert!(self
                .metadata
                .iter()
                .all(|meta| matches!(meta, ValueMeta::Observed(_))));
        }

        empty
    }

    /// Returns the number of values serialized in the batch
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns the size of the buffer wrapped by the batch
    pub fn buffer_size(&self) -> usize {
        self.raw.len()
    }

    pub fn updates_key(&self, key: &Key) -> bool {
        self.metadata.iter().any(|meta| match meta {
            ValueMeta::Serialized(ser) => key.to_compact() == ser.key,
            ValueMeta::Observed(_) => false,
        })
    }

    pub fn validate_lsn_order(&self) {
        use std::collections::HashMap;

        let mut last_seen_lsn_per_key: HashMap<CompactKey, Lsn> = HashMap::default();

        for meta in self.metadata.iter() {
            let lsn = meta.lsn();
            let key = meta.key();

            if let Some(prev_lsn) = last_seen_lsn_per_key.insert(key, lsn) {
                assert!(
                    lsn >= prev_lsn,
                    "Ordering violated by {}: {} < {}",
                    Key::from_compact(key),
                    lsn,
                    prev_lsn
                );
            }
        }
    }
}
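Taken together, the methods above are used roughly as follows during ingest: per-record batches are merged into an accumulating batch, which is then patched for relation-extension gaps. A simplified sketch under those assumptions (the real call-sites live in the pageserver ingest path, and the tests below exercise the same flow):

// Simplified flow sketch, illustrative only.
fn accumulate(
    acc: &mut SerializedValueBatch,
    per_record: SerializedValueBatch,
    gaps: Vec<(KeySpace, Lsn)>,
) {
    acc.extend(per_record);
    if !gaps.is_empty() {
        acc.zero_gaps(gaps);
    }
}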
#[cfg(all(test, feature = "testing"))]
mod tests {
    use super::*;

    fn validate_batch(
        batch: &SerializedValueBatch,
        values: &[(CompactKey, Lsn, usize, Value)],
        gaps: Option<&Vec<(KeySpace, Lsn)>>,
    ) {
        // Invariant 1: The metadata for a given entry in the batch
        // is correct and can be used to deserialize back to the original value.
        for (key, lsn, size, value) in values.iter() {
            let meta = batch
                .metadata
                .iter()
                .find(|meta| (meta.key(), meta.lsn()) == (*key, *lsn))
                .unwrap();
            let meta = match meta {
                ValueMeta::Serialized(ser) => ser,
                ValueMeta::Observed(_) => unreachable!(),
            };

            assert_eq!(meta.len, *size);
            assert_eq!(meta.will_init, value.will_init());

            let start = meta.batch_offset as usize;
            let end = meta.batch_offset as usize + meta.len;
            let value_from_batch = Value::des(&batch.raw[start..end]).unwrap();
            assert_eq!(&value_from_batch, value);
        }

        let mut expected_buffer_size: usize = values.iter().map(|(_, _, size, _)| size).sum();
        let mut gap_pages_count: usize = 0;

        // Invariant 2: Zero pages were added for identified gaps and their metadata
        // is correct.
        if let Some(gaps) = gaps {
            for (gap_keyspace, lsn) in gaps {
                for gap_range in &gap_keyspace.ranges {
                    let mut gap_key = gap_range.start;
                    while gap_key != gap_range.end {
                        let meta = batch
                            .metadata
                            .iter()
                            .find(|meta| (meta.key(), meta.lsn()) == (gap_key.to_compact(), *lsn))
                            .unwrap();
                        let meta = match meta {
                            ValueMeta::Serialized(ser) => ser,
                            ValueMeta::Observed(_) => unreachable!(),
                        };

                        let zero_value = Value::Image(ZERO_PAGE.clone());
                        let zero_value_size = zero_value.serialized_size().unwrap() as usize;

                        assert_eq!(meta.len, zero_value_size);
                        assert_eq!(meta.will_init, zero_value.will_init());

                        let start = meta.batch_offset as usize;
                        let end = meta.batch_offset as usize + meta.len;
                        let value_from_batch = Value::des(&batch.raw[start..end]).unwrap();
                        assert_eq!(value_from_batch, zero_value);

                        gap_pages_count += 1;
                        expected_buffer_size += zero_value_size;
                        gap_key = gap_key.next();
                    }
                }
            }
        }

        // Invariant 3: The length of the batch is equal to the number
        // of values inserted, plus the number of gap pages. This extends
        // to the raw buffer size.
        assert_eq!(batch.len(), values.len() + gap_pages_count);
        assert_eq!(expected_buffer_size, batch.buffer_size());

        // Invariant 4: Metadata entries for any given key are sorted in LSN order.
        batch.validate_lsn_order();
    }

    #[test]
    fn test_creation_from_values() {
        const LSN: Lsn = Lsn(0x10);
        let key = Key::from_hex("110000000033333333444444445500000001").unwrap();

        let values = vec![
            (
                key.to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("foo")),
            ),
            (
                key.next().to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("bar")),
            ),
            (
                key.to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("baz")),
            ),
            (
                key.next().next().to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("taz")),
            ),
        ];

        let values = values
            .into_iter()
            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
            .collect::<Vec<_>>();
        let batch = SerializedValueBatch::from_values(values.clone());

        validate_batch(&batch, &values, None);

        assert!(!batch.is_empty());
    }

    #[test]
    fn test_put() {
        const LSN: Lsn = Lsn(0x10);
        let key = Key::from_hex("110000000033333333444444445500000001").unwrap();

        let values = vec![
            (
                key.to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("foo")),
            ),
            (
                key.next().to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("bar")),
            ),
        ];

        let mut values = values
            .into_iter()
            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
            .collect::<Vec<_>>();
        let mut batch = SerializedValueBatch::from_values(values.clone());

        validate_batch(&batch, &values, None);

        let value = (
            key.to_compact(),
            Lsn(LSN.0 + 0x10),
            Value::WalRecord(NeonWalRecord::wal_append("baz")),
        );
        let serialized_size = value.2.serialized_size().unwrap() as usize;
        let value = (value.0, value.1, serialized_size, value.2);
        values.push(value.clone());
        batch.put(value.0, value.3, value.1);

        validate_batch(&batch, &values, None);

        let value = (
            key.next().next().to_compact(),
            LSN,
            Value::WalRecord(NeonWalRecord::wal_append("taz")),
        );
        let serialized_size = value.2.serialized_size().unwrap() as usize;
        let value = (value.0, value.1, serialized_size, value.2);
        values.push(value.clone());
        batch.put(value.0, value.3, value.1);

        validate_batch(&batch, &values, None);
    }

    #[test]
    fn test_extension() {
        const LSN: Lsn = Lsn(0x10);
        let key = Key::from_hex("110000000033333333444444445500000001").unwrap();

        let values = vec![
            (
                key.to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("foo")),
            ),
            (
                key.next().to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("bar")),
            ),
            (
                key.next().next().to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("taz")),
            ),
        ];

        let mut values = values
            .into_iter()
            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
            .collect::<Vec<_>>();
        let mut batch = SerializedValueBatch::from_values(values.clone());

        let other_values = vec![
            (
                key.to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("foo")),
            ),
            (
                key.next().to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("bar")),
            ),
            (
                key.next().next().to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("taz")),
            ),
        ];

        let other_values = other_values
            .into_iter()
            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
            .collect::<Vec<_>>();
        let other_batch = SerializedValueBatch::from_values(other_values.clone());

        values.extend(other_values);
        batch.extend(other_batch);

        validate_batch(&batch, &values, None);
    }

    #[test]
    fn test_gap_zeroing() {
        const LSN: Lsn = Lsn(0x10);
        let rel_foo_base_key = Key::from_hex("110000000033333333444444445500000001").unwrap();

        let rel_bar_base_key = {
            let mut key = rel_foo_base_key;
            key.field4 += 1;
            key
        };

        let values = vec![
            (
                rel_foo_base_key.to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("foo1")),
            ),
            (
                rel_foo_base_key.add(1).to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("foo2")),
            ),
            (
                rel_foo_base_key.add(5).to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("foo3")),
            ),
            (
                rel_foo_base_key.add(1).to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("foo4")),
            ),
            (
                rel_foo_base_key.add(10).to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("foo5")),
            ),
            (
                rel_foo_base_key.add(11).to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("foo6")),
            ),
            (
                rel_foo_base_key.add(12).to_compact(),
                Lsn(LSN.0 + 0x10),
                Value::WalRecord(NeonWalRecord::wal_append("foo7")),
            ),
            (
                rel_bar_base_key.to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("bar1")),
            ),
            (
                rel_bar_base_key.add(4).to_compact(),
                LSN,
                Value::WalRecord(NeonWalRecord::wal_append("bar2")),
            ),
        ];

        let values = values
            .into_iter()
            .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value))
            .collect::<Vec<_>>();

        let mut batch = SerializedValueBatch::from_values(values.clone());

        let gaps = vec![
            (
                KeySpace {
                    ranges: vec![
                        rel_foo_base_key.add(2)..rel_foo_base_key.add(5),
                        rel_bar_base_key.add(1)..rel_bar_base_key.add(4),
                    ],
                },
                LSN,
            ),
            (
                KeySpace {
                    ranges: vec![rel_foo_base_key.add(6)..rel_foo_base_key.add(10)],
                },
                Lsn(LSN.0 + 0x10),
            ),
        ];

        batch.zero_gaps(gaps.clone());
        validate_batch(&batch, &values, Some(&gaps));
    }
}
@@ -8,7 +8,7 @@ license.workspace = true
default = []
# Enables test-only APIs, including failpoints. In particular, enables the `fail_point!` macro,
# which adds some runtime cost to run tests on outage conditions
testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"]
testing = ["fail/failpoints", "pageserver_api/testing" ]

[dependencies]
anyhow.workspace = true
@@ -83,7 +83,6 @@ enum-map.workspace = true
enumset = { workspace = true, features = ["serde"]}
strum.workspace = true
strum_macros.workspace = true
wal_decoder.workspace = true

[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true
@@ -8,16 +8,17 @@ use pageserver::{
    context::{DownloadBehavior, RequestContext},
    l0_flush::{L0FlushConfig, L0FlushGlobalState},
    page_cache,
    repository::Value,
    task_mgr::TaskKind,
    tenant::storage_layer::inmemory_layer::SerializedBatch,
    tenant::storage_layer::InMemoryLayer,
    virtual_file,
};
use pageserver_api::{key::Key, shard::TenantShardId, value::Value};
use pageserver_api::{key::Key, shard::TenantShardId};
use utils::{
    bin_ser::BeSer,
    id::{TenantId, TimelineId},
};
use wal_decoder::serialized_batch::SerializedValueBatch;

// A very cheap hash for generating non-sequential keys.
fn murmurhash32(mut h: u32) -> u32 {
@@ -102,13 +103,13 @@ async fn ingest(
        batch.push((key.to_compact(), lsn, data_ser_size, data.clone()));
        if batch.len() >= BATCH_SIZE {
            let this_batch = std::mem::take(&mut batch);
            let serialized = SerializedValueBatch::from_values(this_batch);
            let serialized = SerializedBatch::from_values(this_batch).unwrap();
            layer.put_batch(serialized, &ctx).await?;
        }
    }
    if !batch.is_empty() {
        let this_batch = std::mem::take(&mut batch);
        let serialized = SerializedValueBatch::from_values(this_batch);
        let serialized = SerializedBatch::from_values(this_batch).unwrap();
        layer.put_batch(serialized, &ctx).await?;
    }
    layer.freeze(lsn + 1).await;

@@ -1,9 +1,9 @@
use criterion::measurement::WallTime;
use pageserver::keyspace::{KeyPartitioning, KeySpace};
use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::LayerName;
use pageserver::tenant::storage_layer::PersistentLayerDesc;
use pageserver_api::key::Key;
use pageserver_api::shard::TenantShardId;
use rand::prelude::{SeedableRng, SliceRandom, StdRng};
use std::cmp::{max, min};

@@ -60,8 +60,7 @@ use anyhow::Context;
use bytes::{Buf, Bytes};
use criterion::{BenchmarkId, Criterion};
use once_cell::sync::Lazy;
use pageserver::{config::PageServerConf, walredo::PostgresRedoManager};
use pageserver_api::record::NeonWalRecord;
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
use pageserver_api::{key::Key, shard::TenantShardId};
use std::{
    future::Future,

@@ -35,15 +35,6 @@ pub fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    !(a.end <= b.start || b.end <= a.start)
}

/// Whether a fully contains b, example as below
/// ```plain
/// |       a       |
///     |   b   |
/// ```
pub fn fully_contains<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    a.start <= b.start && a.end >= b.end
}

pub fn union_to_keyspace<K: Ord>(a: &mut CompactionKeySpace<K>, b: CompactionKeySpace<K>) {
    let x = std::mem::take(a);
    let mut all_ranges_iter = [x.into_iter(), b.into_iter()]

@@ -51,7 +51,7 @@
//!

use anyhow::{Context, Result};
use pageserver_api::key::Key;
use pageserver::repository::Key;
use std::cmp::Ordering;
use std::io::{self, BufRead};
use std::path::PathBuf;

@@ -2,7 +2,7 @@
//!
//! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.

use anyhow::{anyhow, Result};
use anyhow::Result;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
@@ -11,16 +11,15 @@ use pageserver::virtual_file::api::IoMode;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::ops::Range;
use std::str::FromStr;
use std::{fs, str};

use pageserver::page_cache::{self, PAGE_SZ};
use pageserver::repository::{Key, KEY_SIZE};
use pageserver::tenant::block_io::FileBlockReader;
use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection};
use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE};
use pageserver::tenant::storage_layer::{range_overlaps, LayerName};
use pageserver::tenant::storage_layer::range_overlaps;
use pageserver::virtual_file::{self, VirtualFile};
use pageserver_api::key::{Key, KEY_SIZE};

use utils::{bin_ser::BeSer, lsn::Lsn};

@@ -75,15 +74,35 @@ impl LayerFile {
    }
}

pub(crate) fn parse_filename(name: &str) -> anyhow::Result<LayerFile> {
    let layer_name =
        LayerName::from_str(name).map_err(|e| anyhow!("failed to parse layer name: {e}"))?;
pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
    let split: Vec<&str> = name.split("__").collect();
    if split.len() != 2 {
        return None;
    }
    let keys: Vec<&str> = split[0].split('-').collect();
    let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect();
    let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect();
    let the_lsns: [&str; 2];

    /*
     * Generations add a -vX-XXXXXX postfix, which causes issues when we try to
     * parse 'vX' as an LSN.
     */
    let is_delta = if lsns.len() == 1 || lsns[1].is_empty() {
        the_lsns = [lsns[0], lsns[0]];
        false
    } else {
        the_lsns = [lsns[0], lsns[1]];
        true
    };

    let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap();
    let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap();
    let holes = Vec::new();
    Ok(LayerFile {
        key_range: layer_name.key_range().clone(),
        lsn_range: layer_name.lsn_as_range(),
        is_delta: layer_name.is_delta(),
    Some(LayerFile {
        key_range,
        lsn_range,
        is_delta,
        holes,
    })
}
@@ -160,7 +179,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {

    for layer in fs::read_dir(timeline.path())? {
        let layer = layer?;
        if let Ok(mut layer_file) =
        if let Some(mut layer_file) =
            parse_filename(&layer.file_name().into_string().unwrap())
        {
            if layer_file.is_delta {

@@ -5,12 +5,24 @@ use camino::{Utf8Path, Utf8PathBuf};
use clap::Subcommand;
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;
use pageserver::tenant::block_io::BlockCursor;
use pageserver::tenant::disk_btree::DiskBtreeReader;
use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
use pageserver::tenant::storage_layer::{delta_layer, image_layer};
use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
use pageserver::virtual_file::api::IoMode;
use pageserver::{page_cache, virtual_file};
use std::fs::{self, File};
use pageserver::{
    repository::{Key, KEY_SIZE},
    tenant::{
        block_io::FileBlockReader, disk_btree::VisitDirection,
        storage_layer::delta_layer::DELTA_KEY_SIZE,
    },
    virtual_file::VirtualFile,
};
use std::fs;
use utils::bin_ser::BeSer;
use utils::id::{TenantId, TimelineId};

use crate::layer_map_analyzer::parse_filename;
@@ -47,30 +59,44 @@ pub(crate) enum LayerCmd {
}

async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
    virtual_file::init(
        10,
        virtual_file::api::IoEngineKind::StdFs,
        IoMode::preferred(),
    );
    page_cache::init(100);
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
    let file = File::open(path)?;
    let delta_layer = DeltaLayer::new_for_path(path, file)?;
    delta_layer.dump(true, ctx).await?;
    Ok(())
}

async fn read_image_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
    virtual_file::init(
        10,
        virtual_file::api::IoEngineKind::StdFs,
        IoMode::preferred(),
    let file = VirtualFile::open(path, ctx).await?;
    let file_id = page_cache::next_file_id();
    let block_reader = FileBlockReader::new(&file, file_id);
    let summary_blk = block_reader.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
    let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
        actual_summary.index_start_blk,
        actual_summary.index_root_blk,
        &block_reader,
    );
    page_cache::init(100);
    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
    let file = File::open(path)?;
    let image_layer = ImageLayer::new_for_path(path, file)?;
    image_layer.dump(true, ctx).await?;
    // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API.
    let mut all = vec![];
    tree_reader
        .visit(
            &[0u8; DELTA_KEY_SIZE],
            VisitDirection::Forwards,
            |key, value_offset| {
                let curr = Key::from_slice(&key[..KEY_SIZE]);
                all.push((curr, BlobRef(value_offset)));
                true
            },
            ctx,
        )
        .await?;
    let cursor = BlockCursor::new_fileblockreader(&block_reader);
    for (k, v) in all {
        let value = cursor.read_blob(v.pos(), ctx).await?;
        println!("key:{} value_len:{}", k, value.len());
        assert!(k.is_i128_representable(), "invalid key: ");
    }
    // TODO(chi): special handling for last key?
    Ok(())
}

@@ -107,7 +133,8 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            let mut idx = 0;
            for layer in fs::read_dir(timeline_path)? {
                let layer = layer?;
                if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
                {
                    println!(
                        "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
                        idx,
@@ -136,7 +163,8 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
            let mut idx = 0;
            for layer in fs::read_dir(timeline_path)? {
                let layer = layer?;
                if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
                {
                    if *id == idx {
                        // TODO(chi): dedup code
                        println!(
@@ -152,7 +180,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                        if layer_file.is_delta {
                            read_delta_file(layer.path(), &ctx).await?;
                        } else {
                            read_image_file(layer.path(), &ctx).await?;
                            anyhow::bail!("not supported yet :(");
                        }

                        break;

@@ -1,4 +1,4 @@
use pageserver_api::models::{TenantConfig, TenantConfigRequest};
use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest};
use pageserver_api::shard::TenantShardId;
use utils::id::TenantTimelineId;
use utils::lsn::Lsn;
@@ -66,7 +66,10 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    mgmt_api_client
        .tenant_config(&TenantConfigRequest {
            tenant_id: timeline.tenant_id,
            config: TenantConfig::default(),
            config: TenantConfig {
                switch_aux_file_policy: Some(AuxFilePolicy::V2),
                ..Default::default()
            },
        })
        .await?;

@@ -19,8 +19,7 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
            | Scope::SafekeeperData
            | Scope::GenerationsApi
            | Scope::Infra
            | Scope::Scrubber
            | Scope::ControllerPeer,
            | Scope::Scrubber,
            _,
        ) => Err(AuthError(
            format!(

@@ -59,7 +59,6 @@ pub async fn send_basebackup_tarball<'a, W>(
    req_lsn: Option<Lsn>,
    prev_lsn: Option<Lsn>,
    full_backup: bool,
    replica: bool,
    ctx: &'a RequestContext,
) -> Result<(), BasebackupError>
where
@@ -111,8 +110,8 @@ where
    };

    info!(
        "taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})",
        backup_lsn, prev_lsn, full_backup, replica
        "taking basebackup lsn={}, prev_lsn={} (full_backup={})",
        backup_lsn, prev_lsn, full_backup
    );

    let basebackup = Basebackup {
@@ -121,7 +120,6 @@ where
        lsn: backup_lsn,
        prev_record_lsn: prev_lsn,
        full_backup,
        replica,
        ctx,
    };
    basebackup
@@ -142,7 +140,6 @@ where
    lsn: Lsn,
    prev_record_lsn: Lsn,
    full_backup: bool,
    replica: bool,
    ctx: &'a RequestContext,
}

@@ -375,10 +372,6 @@ where

        for (path, content) in aux_files {
            if path.starts_with("pg_replslot") {
                // Do not create LR slots at standby because they are not used but prevent WAL truncation
                if self.replica {
                    continue;
                }
                let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN;
                let restart_lsn = Lsn(u64::from_le_bytes(
                    content[offs..offs + 8].try_into().unwrap(),

@@ -154,17 +154,13 @@ fn main() -> anyhow::Result<()> {
        },
    };

    if conf.no_sync {
        info!("Skipping syncfs on startup");
    } else {
        let started = Instant::now();
        syncfs(dirfd)?;
        let elapsed = started.elapsed();
        info!(
            elapsed_ms = elapsed.as_millis(),
            "made tenant directory contents durable"
        );
    }
    let started = Instant::now();
    syncfs(dirfd)?;
    let elapsed = started.elapsed();
    info!(
        elapsed_ms = elapsed.as_millis(),
        "made tenant directory contents durable"
    );
}

// Initialize up failpoints support
@@ -402,7 +398,9 @@ fn start_pageserver(
        ControllerUpcallClient::new(conf, &shutdown_pageserver),
        conf,
    );
    deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
    if let Some(deletion_workers) = deletion_workers {
        deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
    }

    // Up to this point no significant I/O has been done: this should have been fast. Record
    // duration prior to starting I/O intensive phase of startup.

@@ -69,7 +69,6 @@ pub struct PageServerConf {
    pub wal_redo_timeout: Duration,

    pub superuser: String,
    pub locale: String,

    pub page_cache_size: usize,
    pub max_file_descriptors: usize,
@@ -179,9 +178,6 @@ pub struct PageServerConf {

    /// Direct IO settings
    pub virtual_file_io_mode: virtual_file::IoMode,

    /// Optionally disable disk syncs (unsafe!)
    pub no_sync: bool,
}

/// Token for authentication to safekeepers
@@ -302,7 +298,6 @@ impl PageServerConf {
            wait_lsn_timeout,
            wal_redo_timeout,
            superuser,
            locale,
            page_cache_size,
            max_file_descriptors,
            pg_distrib_dir,
@@ -337,7 +332,6 @@ impl PageServerConf {
            concurrent_tenant_size_logical_size_queries,
            virtual_file_io_engine,
            tenant_config,
            no_sync,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -350,7 +344,6 @@ impl PageServerConf {
            wait_lsn_timeout,
            wal_redo_timeout,
            superuser,
            locale,
            page_cache_size,
            max_file_descriptors,
            http_auth_type,
@@ -416,7 +409,6 @@ impl PageServerConf {
                .map(crate::l0_flush::L0FlushConfig::from)
                .unwrap_or_default(),
            virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
            no_sync: no_sync.unwrap_or(false),
        };

        // ------------------------------------------------------------
@@ -14,7 +14,6 @@ use itertools::Itertools as _;
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{GenericRemoteStorage, RemoteStorageConfig};
|
||||
use reqwest::Url;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime};
|
||||
@@ -36,62 +35,12 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
|
||||
/// upload attempts.
|
||||
type RawMetric = (MetricsKey, (EventType, u64));
|
||||
|
||||
/// The new serializable metrics format
|
||||
#[derive(Serialize, Deserialize)]
|
||||
struct NewMetricsRoot {
|
||||
version: usize,
|
||||
metrics: Vec<NewRawMetric>,
|
||||
}
|
||||
|
||||
impl NewMetricsRoot {
|
||||
pub fn is_v2_metrics(json_value: &serde_json::Value) -> bool {
|
||||
if let Some(ver) = json_value.get("version") {
|
||||
if let Some(2) = ver.as_u64() {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// The new serializable metrics format
|
||||
#[derive(Serialize)]
|
||||
struct NewMetricsRefRoot<'a> {
|
||||
version: usize,
|
||||
metrics: &'a [NewRawMetric],
|
||||
}
|
||||
|
||||
impl<'a> NewMetricsRefRoot<'a> {
|
||||
fn new(metrics: &'a [NewRawMetric]) -> Self {
|
||||
Self {
|
||||
version: 2,
|
||||
metrics,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The new serializable metrics format
|
||||
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
|
||||
struct NewRawMetric {
|
||||
key: MetricsKey,
|
||||
kind: EventType,
|
||||
value: u64,
|
||||
// TODO: add generation field and check against generations
|
||||
}
|
||||
|
||||
impl NewRawMetric {
|
||||
#[cfg(test)]
|
||||
fn to_kv_pair(&self) -> (MetricsKey, NewRawMetric) {
|
||||
(self.key, self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
/// Caches the [`RawMetric`]s
|
||||
///
|
||||
/// In practice, during startup, last sent values are stored here to be used in calculating new
|
||||
/// ones. After successful uploading, the cached values are updated to cache. This used to be used
|
||||
/// for deduplication, but that is no longer needed.
|
||||
type Cache = HashMap<MetricsKey, NewRawMetric>;
|
||||
type Cache = HashMap<MetricsKey, (EventType, u64)>;
|
||||
|
||||
pub async fn run(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -282,14 +231,11 @@ async fn restore_and_reschedule(
|
||||
// collect_all_metrics
|
||||
let earlier_metric_at = found_some
|
||||
.iter()
|
||||
.map(|item| item.kind.recorded_at())
|
||||
.map(|(_, (et, _))| et.recorded_at())
|
||||
.copied()
|
||||
.next();
|
||||
|
||||
let cached = found_some
|
||||
.into_iter()
|
||||
.map(|item| (item.key, item))
|
||||
.collect::<Cache>();
|
||||
let cached = found_some.into_iter().collect::<Cache>();
|
||||
|
||||
(cached, earlier_metric_at)
|
||||
}
|
||||
|
||||
@@ -2,33 +2,11 @@ use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::consumption_metrics::NewMetricsRefRoot;
|
||||
|
||||
use super::{NewMetricsRoot, NewRawMetric, RawMetric};
|
||||
|
||||
pub(super) fn read_metrics_from_serde_value(
|
||||
json_value: serde_json::Value,
|
||||
) -> anyhow::Result<Vec<NewRawMetric>> {
|
||||
if NewMetricsRoot::is_v2_metrics(&json_value) {
|
||||
let root = serde_json::from_value::<NewMetricsRoot>(json_value)?;
|
||||
Ok(root.metrics)
|
||||
} else {
|
||||
let all_metrics = serde_json::from_value::<Vec<RawMetric>>(json_value)?;
|
||||
let all_metrics = all_metrics
|
||||
.into_iter()
|
||||
.map(|(key, (event_type, value))| NewRawMetric {
|
||||
key,
|
||||
kind: event_type,
|
||||
value,
|
||||
})
|
||||
.collect();
|
||||
Ok(all_metrics)
|
||||
}
|
||||
}
|
||||
use super::RawMetric;
|
||||
|
||||
pub(super) async fn read_metrics_from_disk(
|
||||
path: Arc<Utf8PathBuf>,
|
||||
) -> anyhow::Result<Vec<NewRawMetric>> {
|
||||
) -> anyhow::Result<Vec<RawMetric>> {
|
||||
// do not add context to each error, callsite will log with full path
|
||||
let span = tracing::Span::current();
|
||||
tokio::task::spawn_blocking(move || {
|
||||
@@ -42,8 +20,7 @@ pub(super) async fn read_metrics_from_disk(
|
||||
|
||||
let mut file = std::fs::File::open(&*path)?;
|
||||
let reader = std::io::BufReader::new(&mut file);
|
||||
let json_value = serde_json::from_reader::<_, serde_json::Value>(reader)?;
|
||||
read_metrics_from_serde_value(json_value)
|
||||
anyhow::Ok(serde_json::from_reader::<_, Vec<RawMetric>>(reader)?)
|
||||
})
|
||||
.await
|
||||
.context("read metrics join error")
|
||||
@@ -86,7 +63,7 @@ fn scan_and_delete_with_same_prefix(path: &Utf8Path) -> std::io::Result<()> {
|
||||
}
|
||||
|
||||
pub(super) async fn flush_metrics_to_disk(
|
||||
current_metrics: &Arc<Vec<NewRawMetric>>,
|
||||
current_metrics: &Arc<Vec<RawMetric>>,
|
||||
path: &Arc<Utf8PathBuf>,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::io::Write;
|
||||
@@ -116,11 +93,8 @@ pub(super) async fn flush_metrics_to_disk(
|
||||
// write out all of the raw metrics, to be read out later on restart as cached values
|
||||
{
|
||||
let mut writer = std::io::BufWriter::new(&mut tempfile);
|
||||
serde_json::to_writer(
|
||||
&mut writer,
|
||||
&NewMetricsRefRoot::new(current_metrics.as_ref()),
|
||||
)
|
||||
.context("serialize metrics")?;
|
||||
serde_json::to_writer(&mut writer, &*current_metrics)
|
||||
.context("serialize metrics")?;
|
||||
writer
|
||||
.into_inner()
|
||||
.map_err(|_| anyhow::anyhow!("flushing metrics failed"))?;
|
||||
|
||||
@@ -9,7 +9,7 @@ use utils::{
|
||||
lsn::Lsn,
|
||||
};
|
||||
|
||||
use super::{Cache, NewRawMetric};
|
||||
use super::{Cache, RawMetric};
|
||||
|
||||
/// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events`
|
||||
/// instead of static str.
|
||||
@@ -64,21 +64,11 @@ impl MetricsKey {
|
||||
struct AbsoluteValueFactory(MetricsKey);
|
||||
|
||||
impl AbsoluteValueFactory {
|
||||
#[cfg(test)]
|
||||
const fn at_old_format(self, time: DateTime<Utc>, val: u64) -> super::RawMetric {
|
||||
const fn at(self, time: DateTime<Utc>, val: u64) -> RawMetric {
|
||||
let key = self.0;
|
||||
(key, (EventType::Absolute { time }, val))
|
||||
}
|
||||
|
||||
const fn at(self, time: DateTime<Utc>, val: u64) -> NewRawMetric {
|
||||
let key = self.0;
|
||||
NewRawMetric {
|
||||
key,
|
||||
kind: EventType::Absolute { time },
|
||||
value: val,
|
||||
}
|
||||
}
|
||||
|
||||
fn key(&self) -> &MetricsKey {
|
||||
&self.0
|
||||
}
|
||||
@@ -94,28 +84,7 @@ impl IncrementalValueFactory {
|
||||
prev_end: DateTime<Utc>,
|
||||
up_to: DateTime<Utc>,
|
||||
val: u64,
|
||||
) -> NewRawMetric {
|
||||
let key = self.0;
|
||||
// cannot assert prev_end < up_to because these are realtime clock based
|
||||
let when = EventType::Incremental {
|
||||
start_time: prev_end,
|
||||
stop_time: up_to,
|
||||
};
|
||||
NewRawMetric {
|
||||
key,
|
||||
kind: when,
|
||||
value: val,
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::wrong_self_convention)]
|
||||
#[cfg(test)]
|
||||
const fn from_until_old_format(
|
||||
self,
|
||||
prev_end: DateTime<Utc>,
|
||||
up_to: DateTime<Utc>,
|
||||
val: u64,
|
||||
) -> super::RawMetric {
|
||||
) -> RawMetric {
|
||||
let key = self.0;
|
||||
// cannot assert prev_end < up_to because these are realtime clock based
|
||||
let when = EventType::Incremental {
|
||||
@@ -216,7 +185,7 @@ pub(super) async fn collect_all_metrics(
|
||||
tenant_manager: &Arc<TenantManager>,
|
||||
cached_metrics: &Cache,
|
||||
ctx: &RequestContext,
|
||||
) -> Vec<NewRawMetric> {
|
||||
) -> Vec<RawMetric> {
|
||||
use pageserver_api::models::TenantState;
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
@@ -251,11 +220,11 @@ pub(super) async fn collect_all_metrics(
|
||||
res
|
||||
}
|
||||
|
||||
async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<NewRawMetric>
|
||||
async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<RawMetric>
|
||||
where
|
||||
S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
|
||||
{
|
||||
let mut current_metrics: Vec<NewRawMetric> = Vec::new();
|
||||
let mut current_metrics: Vec<RawMetric> = Vec::new();
|
||||
|
||||
let mut tenants = std::pin::pin!(tenants);
|
||||
|
||||
@@ -322,7 +291,7 @@ impl TenantSnapshot {
|
||||
tenant_id: TenantId,
|
||||
now: DateTime<Utc>,
|
||||
cached: &Cache,
|
||||
metrics: &mut Vec<NewRawMetric>,
|
||||
metrics: &mut Vec<RawMetric>,
|
||||
) {
|
||||
let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size);
|
||||
|
||||
@@ -333,9 +302,9 @@ impl TenantSnapshot {
|
||||
let mut synthetic_size = self.synthetic_size;
|
||||
|
||||
if synthetic_size == 0 {
|
||||
if let Some(item) = cached.get(factory.key()) {
|
||||
// use the latest value from previous session, TODO: check generation number
|
||||
synthetic_size = item.value;
|
||||
if let Some((_, value)) = cached.get(factory.key()) {
|
||||
// use the latest value from previous session
|
||||
synthetic_size = *value;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -412,36 +381,37 @@ impl TimelineSnapshot {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
metrics: &mut Vec<NewRawMetric>,
|
||||
metrics: &mut Vec<RawMetric>,
|
||||
cache: &Cache,
|
||||
) {
|
||||
let timeline_written_size = u64::from(self.last_record_lsn);
|
||||
|
||||
let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id);
|
||||
|
||||
let last_stop_time = cache.get(written_size_delta_key.key()).map(|item| {
|
||||
item.kind
|
||||
.incremental_timerange()
|
||||
.expect("never create EventType::Absolute for written_size_delta")
|
||||
.end
|
||||
});
|
||||
let last_stop_time = cache
|
||||
.get(written_size_delta_key.key())
|
||||
.map(|(until, _val)| {
|
||||
until
|
||||
.incremental_timerange()
|
||||
.expect("never create EventType::Absolute for written_size_delta")
|
||||
.end
|
||||
});
|
||||
|
||||
let written_size_now =
|
||||
let (key, written_size_now) =
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size);
|
||||
|
||||
// by default, use the last sent written_size as the basis for
|
||||
// calculating the delta. if we don't yet have one, use the load time value.
|
||||
let prev: (DateTime<Utc>, u64) = cache
|
||||
.get(&written_size_now.key)
|
||||
.map(|item| {
|
||||
let prev = cache
|
||||
.get(&key)
|
||||
.map(|(prev_at, prev)| {
|
||||
// use the prev time from our last incremental update, or default to latest
|
||||
// absolute update on the first round.
|
||||
let prev_at = item
|
||||
.kind
|
||||
let prev_at = prev_at
|
||||
.absolute_time()
|
||||
.expect("never create EventType::Incremental for written_size");
|
||||
let prev_at = last_stop_time.unwrap_or(prev_at);
|
||||
(*prev_at, item.value)
|
||||
(*prev_at, *prev)
|
||||
})
|
||||
.unwrap_or_else(|| {
|
||||
// if we don't have a previous point of comparison, compare to the load time
|
||||
@@ -452,28 +422,24 @@ impl TimelineSnapshot {
|
||||
|
||||
let up_to = now;
|
||||
|
||||
if let Some(delta) = written_size_now.value.checked_sub(prev.1) {
|
||||
if let Some(delta) = written_size_now.1.checked_sub(prev.1) {
|
||||
let key_value = written_size_delta_key.from_until(prev.0, up_to, delta);
|
||||
// written_size_delta
|
||||
metrics.push(key_value);
|
||||
// written_size
|
||||
metrics.push(written_size_now);
|
||||
metrics.push((key, written_size_now));
|
||||
} else {
|
||||
// the cached value was ahead of us, report zero until we've caught up
|
||||
metrics.push(written_size_delta_key.from_until(prev.0, up_to, 0));
|
||||
// the cached value was ahead of us, report the same until we've caught up
|
||||
metrics.push(NewRawMetric {
|
||||
key: written_size_now.key,
|
||||
kind: written_size_now.kind,
|
||||
value: prev.1,
|
||||
});
|
||||
metrics.push((key, (written_size_now.0, prev.1)));
|
||||
}
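// Standalone sketch of the comparison above (illustrative only; the helper name
// is made up). checked_sub() distinguishes normal forward progress, where the
// real byte delta is emitted, from the post-restart case where the cached
// counter is ahead of the rolled-back last_record_lsn and the else-branch
// above reports a catch-up value instead.
fn delta_since(prev: u64, now: u64) -> Option<u64> {
    now.checked_sub(prev)
}
// delta_since(100, 140) == Some(40)  -> normal case, emit a 40-byte written_size_delta
// delta_since(100,  90) == None      -> counter rolled back after restart, take the else-branch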
|
||||
|
||||
{
|
||||
let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id);
|
||||
let current_or_previous = self
|
||||
.current_exact_logical_size
|
||||
.or_else(|| cache.get(factory.key()).map(|item| item.value));
|
||||
.or_else(|| cache.get(factory.key()).map(|(_, val)| *val));
|
||||
|
||||
if let Some(size) = current_or_previous {
|
||||
metrics.push(factory.at(now, size));
|
||||
@@ -486,4 +452,4 @@ impl TimelineSnapshot {
|
||||
mod tests;
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) use tests::{metric_examples, metric_examples_old};
|
||||
pub(crate) use tests::metric_examples;
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use crate::consumption_metrics::RawMetric;
|
||||
|
||||
use super::*;
|
||||
use std::collections::HashMap;
|
||||
|
||||
@@ -52,9 +50,9 @@ fn startup_collected_timeline_metrics_second_round() {
|
||||
let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id)
|
||||
.at(before, disk_consistent_lsn.0)
|
||||
.to_kv_pair()]);
|
||||
let cache = HashMap::from([
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0)
|
||||
]);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
loaded_at: (disk_consistent_lsn, init),
|
||||
@@ -91,13 +89,9 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() {
|
||||
let mut metrics = Vec::new();
|
||||
let cache = HashMap::from([
|
||||
// at t=before was the last time the last_record_lsn changed
|
||||
MetricsKey::written_size(tenant_id, timeline_id)
|
||||
.at(before, disk_consistent_lsn.0)
|
||||
.to_kv_pair(),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0),
|
||||
// end time of this event is used for the next ones
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_until(before, just_before, 0)
|
||||
.to_kv_pair(),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, just_before, 0),
|
||||
]);
|
||||
|
||||
let snap = TimelineSnapshot {
|
||||
@@ -144,17 +138,13 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
|
||||
};
|
||||
|
||||
let mut cache = HashMap::from([
|
||||
MetricsKey::written_size(tenant_id, timeline_id)
|
||||
.at(before_restart, 100)
|
||||
.to_kv_pair(),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_until(
|
||||
way_before,
|
||||
before_restart,
|
||||
// not taken into account, but the timestamps are important
|
||||
999_999_999,
|
||||
)
|
||||
.to_kv_pair(),
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(before_restart, 100),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(
|
||||
way_before,
|
||||
before_restart,
|
||||
// not taken into account, but the timestamps are important
|
||||
999_999_999,
|
||||
),
|
||||
]);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
@@ -173,7 +163,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() {
|
||||
);
|
||||
|
||||
// now if we cache these metrics, and re-run while "still in recovery"
|
||||
cache.extend(metrics.drain(..).map(|x| x.to_kv_pair()));
|
||||
cache.extend(metrics.drain(..));
|
||||
|
||||
// "still in recovery", because our snapshot did not change
|
||||
snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache);
|
||||
@@ -204,14 +194,14 @@ fn post_restart_current_exact_logical_size_uses_cached() {
|
||||
current_exact_logical_size: None,
|
||||
};
|
||||
|
||||
let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id)
|
||||
.at(before_restart, 100)
|
||||
.to_kv_pair()]);
|
||||
let cache = HashMap::from([
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(before_restart, 100)
|
||||
]);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache);
|
||||
|
||||
metrics.retain(|item| item.key.metric == Name::LogicalSize);
|
||||
metrics.retain(|(key, _)| key.metric == Name::LogicalSize);
|
||||
|
||||
assert_eq!(
|
||||
metrics,
|
||||
@@ -234,9 +224,7 @@ fn post_restart_synthetic_size_uses_cached_if_available() {
|
||||
let before_restart = DateTime::<Utc>::from(now - std::time::Duration::from_secs(5 * 60));
|
||||
let now = DateTime::<Utc>::from(now);
|
||||
|
||||
let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id)
|
||||
.at(before_restart, 1000)
|
||||
.to_kv_pair()]);
|
||||
let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id).at(before_restart, 1000)]);
|
||||
|
||||
let mut metrics = Vec::new();
|
||||
ts.to_metrics(tenant_id, now, &cached, &mut metrics);
|
||||
@@ -290,29 +278,12 @@ fn time_backwards<const N: usize>() -> [std::time::SystemTime; N] {
|
||||
times
|
||||
}
|
||||
|
||||
pub(crate) const fn metric_examples_old(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
before: DateTime<Utc>,
|
||||
) -> [RawMetric; 6] {
|
||||
[
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id)
|
||||
.from_until_old_format(before, now, 0),
|
||||
MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0),
|
||||
MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0),
|
||||
MetricsKey::resident_size(tenant_id).at_old_format(now, 0),
|
||||
MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1),
|
||||
]
|
||||
}
|
||||
|
||||
pub(crate) const fn metric_examples(
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
now: DateTime<Utc>,
|
||||
before: DateTime<Utc>,
|
||||
) -> [NewRawMetric; 6] {
|
||||
) -> [RawMetric; 6] {
|
||||
[
|
||||
MetricsKey::written_size(tenant_id, timeline_id).at(now, 0),
|
||||
MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0),
|
||||
|
||||
@@ -7,7 +7,7 @@ use tokio::io::AsyncWriteExt;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
|
||||
use super::{metrics::Name, Cache, MetricsKey, NewRawMetric, RawMetric};
|
||||
use super::{metrics::Name, Cache, MetricsKey, RawMetric};
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
/// How the metrics from pageserver are identified.
|
||||
@@ -24,7 +24,7 @@ pub(super) async fn upload_metrics_http(
|
||||
client: &reqwest::Client,
|
||||
metric_collection_endpoint: &reqwest::Url,
|
||||
cancel: &CancellationToken,
|
||||
metrics: &[NewRawMetric],
|
||||
metrics: &[RawMetric],
|
||||
cached_metrics: &mut Cache,
|
||||
idempotency_keys: &[IdempotencyKey<'_>],
|
||||
) -> anyhow::Result<()> {
|
||||
@@ -53,8 +53,8 @@ pub(super) async fn upload_metrics_http(
|
||||
|
||||
match res {
|
||||
Ok(()) => {
|
||||
for item in chunk {
|
||||
cached_metrics.insert(item.key, item.clone());
|
||||
for (curr_key, curr_val) in chunk {
|
||||
cached_metrics.insert(*curr_key, *curr_val);
|
||||
}
|
||||
uploaded += chunk.len();
|
||||
}
|
||||
@@ -86,7 +86,7 @@ pub(super) async fn upload_metrics_bucket(
|
||||
client: &GenericRemoteStorage,
|
||||
cancel: &CancellationToken,
|
||||
node_id: &str,
|
||||
metrics: &[NewRawMetric],
|
||||
metrics: &[RawMetric],
|
||||
idempotency_keys: &[IdempotencyKey<'_>],
|
||||
) -> anyhow::Result<()> {
|
||||
if metrics.is_empty() {
|
||||
@@ -140,16 +140,16 @@ pub(super) async fn upload_metrics_bucket(
|
||||
/// across different metrics sinks), and must have the same length as input.
|
||||
fn serialize_in_chunks<'a>(
|
||||
chunk_size: usize,
|
||||
input: &'a [NewRawMetric],
|
||||
input: &'a [RawMetric],
|
||||
idempotency_keys: &'a [IdempotencyKey<'a>],
|
||||
) -> impl ExactSizeIterator<Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>> + 'a
|
||||
) -> impl ExactSizeIterator<Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>> + 'a
|
||||
{
|
||||
use bytes::BufMut;
|
||||
|
||||
assert_eq!(input.len(), idempotency_keys.len());
|
||||
|
||||
struct Iter<'a> {
|
||||
inner: std::slice::Chunks<'a, NewRawMetric>,
|
||||
inner: std::slice::Chunks<'a, RawMetric>,
|
||||
idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>,
|
||||
chunk_size: usize,
|
||||
|
||||
@@ -160,7 +160,7 @@ fn serialize_in_chunks<'a>(
|
||||
}
|
||||
|
||||
impl<'a> Iterator for Iter<'a> {
|
||||
type Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>;
|
||||
type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let chunk = self.inner.next()?;
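// Sketch of the pairing contract documented above for serialize_in_chunks
// (hypothetical helper, not part of the patch): metrics and idempotency keys
// must have equal length and are split into chunks of the same size, so a
// retried chunk reuses exactly the keys it was originally serialized with.
fn chunk_pairs<'a, M, K>(
    chunk_size: usize,
    metrics: &'a [M],
    keys: &'a [K],
) -> Vec<(&'a [M], &'a [K])> {
    assert_eq!(metrics.len(), keys.len());
    metrics.chunks(chunk_size).zip(keys.chunks(chunk_size)).collect()
}
// chunk_pairs(2, &[m1, m2, m3], &[k1, k2, k3]) -> [(&[m1, m2], &[k1, k2]), (&[m3], &[k3])]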
|
||||
@@ -269,58 +269,6 @@ impl RawMetricExt for RawMetric {
|
||||
}
|
||||
}
|
||||
|
||||
impl RawMetricExt for NewRawMetric {
|
||||
fn as_event(&self, key: &IdempotencyKey<'_>) -> Event<Ids, Name> {
|
||||
let MetricsKey {
|
||||
metric,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = self.key;
|
||||
|
||||
let kind = self.kind;
|
||||
let value = self.value;
|
||||
|
||||
Event {
|
||||
kind,
|
||||
metric,
|
||||
idempotency_key: key.to_string(),
|
||||
value,
|
||||
extra: Ids {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn update_in_place(&self, event: &mut Event<Ids, Name>, key: &IdempotencyKey<'_>) {
|
||||
use std::fmt::Write;
|
||||
|
||||
let MetricsKey {
|
||||
metric,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
} = self.key;
|
||||
|
||||
let kind = self.kind;
|
||||
let value = self.value;
|
||||
|
||||
*event = Event {
|
||||
kind,
|
||||
metric,
|
||||
idempotency_key: {
|
||||
event.idempotency_key.clear();
|
||||
write!(event.idempotency_key, "{key}").unwrap();
|
||||
std::mem::take(&mut event.idempotency_key)
|
||||
},
|
||||
value,
|
||||
extra: Ids {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) trait KeyGen<'a> {
|
||||
fn generate(&self) -> IdempotencyKey<'a>;
|
||||
}
|
||||
@@ -433,10 +381,6 @@ async fn upload(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::consumption_metrics::{
|
||||
disk_cache::read_metrics_from_serde_value, NewMetricsRefRoot,
|
||||
};
|
||||
|
||||
use super::*;
|
||||
use chrono::{DateTime, Utc};
|
||||
use once_cell::sync::Lazy;
|
||||
@@ -529,49 +473,23 @@ mod tests {
|
||||
let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(*SAMPLES_NOW, "1", 0);
|
||||
let examples = examples.into_iter().zip(metric_samples());
|
||||
|
||||
for ((line, expected), item) in examples {
|
||||
for ((line, expected), (key, (kind, value))) in examples {
|
||||
let e = consumption_metrics::Event {
|
||||
kind: item.kind,
|
||||
metric: item.key.metric,
|
||||
kind,
|
||||
metric: key.metric,
|
||||
idempotency_key: idempotency_key.to_string(),
|
||||
value: item.value,
|
||||
value,
|
||||
extra: Ids {
|
||||
tenant_id: item.key.tenant_id,
|
||||
timeline_id: item.key.timeline_id,
|
||||
tenant_id: key.tenant_id,
|
||||
timeline_id: key.timeline_id,
|
||||
},
|
||||
};
|
||||
let actual = serde_json::to_string(&e).unwrap();
|
||||
assert_eq!(
|
||||
expected, actual,
|
||||
"example for {:?} from line {line}",
|
||||
item.kind
|
||||
);
|
||||
assert_eq!(expected, actual, "example for {kind:?} from line {line}");
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn disk_format_upgrade() {
|
||||
let old_samples_json = serde_json::to_value(metric_samples_old()).unwrap();
|
||||
let new_samples =
|
||||
serde_json::to_value(NewMetricsRefRoot::new(metric_samples().as_ref())).unwrap();
|
||||
let upgraded_samples = read_metrics_from_serde_value(old_samples_json).unwrap();
|
||||
let new_samples = read_metrics_from_serde_value(new_samples).unwrap();
|
||||
assert_eq!(upgraded_samples, new_samples);
|
||||
}
|
||||
|
||||
fn metric_samples_old() -> [RawMetric; 6] {
|
||||
let tenant_id = TenantId::from_array([0; 16]);
|
||||
let timeline_id = TimelineId::from_array([0xff; 16]);
|
||||
|
||||
let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z")
|
||||
.unwrap()
|
||||
.into();
|
||||
let [now, before] = [*SAMPLES_NOW, before];
|
||||
|
||||
super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before)
|
||||
}
|
||||
|
||||
fn metric_samples() -> [NewRawMetric; 6] {
|
||||
fn metric_samples() -> [RawMetric; 6] {
|
||||
let tenant_id = TenantId::from_array([0; 16]);
|
||||
let timeline_id = TimelineId::from_array([0xff; 16]);
|
||||
|
||||
|
||||
@@ -618,11 +618,13 @@ impl DeletionQueue {
|
||||
/// Caller may use the returned object to construct clients with new_client.
|
||||
/// Caller should tokio::spawn the background() members of the two worker objects returned:
|
||||
/// we don't spawn those inside new() so that the caller can use their runtime/spans of choice.
|
||||
///
|
||||
/// If remote_storage is None, then the returned workers will also be None.
|
||||
pub fn new<C>(
|
||||
remote_storage: GenericRemoteStorage,
|
||||
controller_upcall_client: Option<C>,
|
||||
conf: &'static PageServerConf,
|
||||
) -> (Self, DeletionQueueWorkers<C>)
|
||||
) -> (Self, Option<DeletionQueueWorkers<C>>)
|
||||
where
|
||||
C: ControlPlaneGenerationsApi + Send + Sync,
|
||||
{
|
||||
@@ -654,7 +656,7 @@ impl DeletionQueue {
|
||||
},
|
||||
cancel: cancel.clone(),
|
||||
},
|
||||
DeletionQueueWorkers {
|
||||
Some(DeletionQueueWorkers {
|
||||
frontend: ListWriter::new(conf, rx, backend_tx, cancel.clone()),
|
||||
backend: Validator::new(
|
||||
conf,
|
||||
@@ -665,7 +667,7 @@ impl DeletionQueue {
|
||||
cancel.clone(),
|
||||
),
|
||||
executor: Deleter::new(remote_storage, executor_rx, cancel.clone()),
|
||||
},
|
||||
}),
|
||||
)
|
||||
}
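// Illustrative call pattern for the constructor above (comment-only sketch for
// the Option-returning variant; `remote_storage`, `upcall_client` and `conf`
// are assumed to come from pageserver startup and are not shown here):
//
//     let (deletion_queue, workers) = DeletionQueue::new(remote_storage, Some(upcall_client), conf);
//     if let Some(workers) = workers {
//         // spawn the background loops on whatever runtime/span the caller prefers
//         let _join = workers.spawn_with(&tokio::runtime::Handle::current());
//     }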
|
||||
|
||||
@@ -694,7 +696,7 @@ impl DeletionQueue {
|
||||
mod test {
|
||||
use camino::Utf8Path;
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant};
|
||||
use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant};
|
||||
use std::{io::ErrorKind, time::Duration};
|
||||
use tracing::info;
|
||||
|
||||
@@ -703,6 +705,7 @@ mod test {
|
||||
|
||||
use crate::{
|
||||
controller_upcall_client::RetryForeverError,
|
||||
repository::Key,
|
||||
tenant::{harness::TenantHarness, storage_layer::DeltaLayerName},
|
||||
};
|
||||
|
||||
@@ -740,7 +743,9 @@ mod test {
|
||||
);
|
||||
|
||||
tracing::debug!("Spawning worker for new queue queue");
|
||||
let worker_join = workers.spawn_with(&tokio::runtime::Handle::current());
|
||||
let worker_join = workers
|
||||
.unwrap()
|
||||
.spawn_with(&tokio::runtime::Handle::current());
|
||||
|
||||
let old_worker_join = std::mem::replace(&mut self.worker_join, worker_join);
|
||||
let old_deletion_queue = std::mem::replace(&mut self.deletion_queue, deletion_queue);
|
||||
@@ -851,6 +856,7 @@ mod test {
|
||||
harness.conf,
|
||||
);
|
||||
|
||||
let worker = worker.unwrap();
|
||||
let worker_join = worker.spawn_with(&tokio::runtime::Handle::current());
|
||||
|
||||
Ok(TestSetup {
|
||||
|
||||
@@ -37,7 +37,6 @@ use pageserver_api::models::TenantShardLocation;
|
||||
use pageserver_api::models::TenantShardSplitRequest;
|
||||
use pageserver_api::models::TenantShardSplitResponse;
|
||||
use pageserver_api::models::TenantSorting;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::TimelineArchivalConfigRequest;
|
||||
use pageserver_api::models::TimelineCreateRequestMode;
|
||||
use pageserver_api::models::TimelinesInfoAndOffloaded;
|
||||
@@ -81,7 +80,6 @@ use crate::tenant::size::ModelInputs;
|
||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::timeline::offload::offload_timeline;
|
||||
use crate::tenant::timeline::offload::OffloadError;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::tenant::timeline::CompactionError;
|
||||
use crate::tenant::timeline::Timeline;
|
||||
@@ -296,9 +294,6 @@ impl From<GetActiveTenantError> for ApiError {
|
||||
GetActiveTenantError::Broken(reason) => {
|
||||
ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
|
||||
}
|
||||
GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
|
||||
ApiError::ShuttingDown
|
||||
}
|
||||
GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)),
|
||||
GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
|
||||
GetActiveTenantError::NotFound(gte) => gte.into(),
|
||||
@@ -2002,19 +1997,14 @@ async fn timeline_offload_handler(
|
||||
"timeline has attached children".into(),
|
||||
));
|
||||
}
|
||||
if let (false, reason) = timeline.can_offload() {
|
||||
if !timeline.can_offload() {
|
||||
return Err(ApiError::PreconditionFailed(
|
||||
format!("Timeline::can_offload() check failed: {}", reason) .into(),
|
||||
"Timeline::can_offload() returned false".into(),
|
||||
));
|
||||
}
|
||||
offload_timeline(&tenant, &timeline)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
match e {
|
||||
OffloadError::Cancelled => ApiError::ResourceUnavailable("Timeline shutting down".into()),
|
||||
_ => ApiError::InternalServerError(anyhow!(e))
|
||||
}
|
||||
})?;
|
||||
.map_err(ApiError::InternalServerError)?;
|
||||
|
||||
json_response(StatusCode::OK, ())
|
||||
}
|
||||
@@ -2070,7 +2060,6 @@ async fn timeline_checkpoint_handler(
|
||||
.map_err(|e|
|
||||
match e {
|
||||
CompactionError::ShuttingDown => ApiError::ShuttingDown,
|
||||
CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
|
||||
CompactionError::Other(e) => ApiError::InternalServerError(e)
|
||||
}
|
||||
)?;
|
||||
@@ -2169,21 +2158,6 @@ async fn timeline_detach_ancestor_handler(
|
||||
let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
|
||||
let ctx = &ctx;
|
||||
|
||||
// Flush the upload queues of all timelines before detaching ancestor. We do the same thing again
|
||||
// during shutdown. This early upload ensures the pageserver does not need to upload too many
|
||||
// things, which would create downtime during timeline reloads.
|
||||
for timeline in tenant.list_timelines() {
|
||||
timeline
|
||||
.remote_client
|
||||
.wait_completion()
|
||||
.await
|
||||
.map_err(|e| {
|
||||
ApiError::PreconditionFailed(format!("cannot drain upload queue: {e}").into())
|
||||
})?;
|
||||
}
|
||||
|
||||
tracing::info!("all timeline upload queues are drained");
|
||||
|
||||
let timeline = tenant.get_timeline(timeline_id, true)?;
|
||||
|
||||
let progress = timeline
|
||||
@@ -2258,13 +2232,13 @@ async fn getpage_at_lsn_handler(
|
||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||
let state = get_state(&request);
|
||||
|
||||
struct Key(pageserver_api::key::Key);
|
||||
struct Key(crate::repository::Key);
|
||||
|
||||
impl std::str::FromStr for Key {
|
||||
type Err = anyhow::Error;
|
||||
|
||||
fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
|
||||
pageserver_api::key::Key::from_hex(s).map(Key)
|
||||
crate::repository::Key::from_hex(s).map(Key)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -12,7 +12,6 @@ use pageserver_api::key::rel_block_to_key;
|
||||
use tokio::io::{AsyncRead, AsyncReadExt};
|
||||
use tokio_tar::Archive;
|
||||
use tracing::*;
|
||||
use wal_decoder::models::InterpretedWalRecord;
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
@@ -20,6 +19,8 @@ use crate::metrics::WAL_INGEST;
|
||||
use crate::pgdatadir_mapping::*;
|
||||
use crate::tenant::Timeline;
|
||||
use crate::walingest::WalIngest;
|
||||
use crate::walrecord::decode_wal_record;
|
||||
use crate::walrecord::DecodedWALRecord;
|
||||
use pageserver_api::reltag::{RelTag, SlruKind};
|
||||
use postgres_ffi::pg_constants;
|
||||
use postgres_ffi::relfile_utils::*;
|
||||
@@ -312,15 +313,11 @@ async fn import_wal(
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
while last_lsn <= endpoint {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
tline.get_shard_identity(),
|
||||
lsn,
|
||||
tline.pg_version,
|
||||
)?;
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
|
||||
|
||||
walingest
|
||||
.ingest_record(interpreted, &mut modification, ctx)
|
||||
.ingest_record(decoded, lsn, &mut modification, ctx)
|
||||
.await?;
|
||||
WAL_INGEST.records_committed.inc();
|
||||
|
||||
@@ -457,15 +454,10 @@ pub async fn import_wal_from_tar(
|
||||
let mut modification = tline.begin_modification(last_lsn);
|
||||
while last_lsn <= end_lsn {
|
||||
if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
|
||||
let interpreted = InterpretedWalRecord::from_bytes_filtered(
|
||||
recdata,
|
||||
tline.get_shard_identity(),
|
||||
lsn,
|
||||
tline.pg_version,
|
||||
)?;
|
||||
|
||||
let mut decoded = DecodedWALRecord::default();
|
||||
decode_wal_record(recdata, &mut decoded, tline.pg_version)?;
|
||||
walingest
|
||||
.ingest_record(interpreted, &mut modification, ctx)
|
||||
.ingest_record(decoded, lsn, &mut modification, ctx)
|
||||
.await?;
|
||||
modification.commit(ctx).await?;
|
||||
last_lsn = lsn;
|
||||
|
||||
@@ -24,6 +24,7 @@ pub mod metrics;
|
||||
pub mod page_cache;
|
||||
pub mod page_service;
|
||||
pub mod pgdatadir_mapping;
|
||||
pub mod repository;
|
||||
pub mod span;
|
||||
pub(crate) mod statvfs;
|
||||
pub mod task_mgr;
|
||||
@@ -31,6 +32,7 @@ pub mod tenant;
|
||||
pub mod utilization;
|
||||
pub mod virtual_file;
|
||||
pub mod walingest;
|
||||
pub mod walrecord;
|
||||
pub mod walredo;
|
||||
|
||||
use camino::Utf8Path;
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
//! The Page Service listens for client connections and serves their GetPage@LSN
|
||||
//! requests.
|
||||
|
||||
use anyhow::{bail, Context};
|
||||
use anyhow::Context;
|
||||
use async_compression::tokio::write::GzipEncoder;
|
||||
use bytes::Buf;
|
||||
use futures::FutureExt;
|
||||
use itertools::Itertools;
|
||||
use once_cell::sync::OnceCell;
|
||||
use pageserver_api::models::TenantState;
|
||||
use pageserver_api::models::{
|
||||
@@ -1081,7 +1080,6 @@ impl PageServerHandler {
|
||||
prev_lsn: Option<Lsn>,
|
||||
full_backup: bool,
|
||||
gzip: bool,
|
||||
replica: bool,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), QueryError>
|
||||
where
|
||||
@@ -1134,7 +1132,6 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1157,7 +1154,6 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1174,7 +1170,6 @@ impl PageServerHandler {
|
||||
lsn,
|
||||
prev_lsn,
|
||||
full_backup,
|
||||
replica,
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
@@ -1222,222 +1217,6 @@ impl PageServerHandler {
|
||||
}
|
||||
}
|
||||
|
||||
/// `basebackup tenant timeline [lsn] [--gzip] [--replica]`
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
struct BaseBackupCmd {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Option<Lsn>,
|
||||
gzip: bool,
|
||||
replica: bool,
|
||||
}
|
||||
|
||||
/// `fullbackup tenant timeline [lsn] [prev_lsn]`
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
struct FullBackupCmd {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Option<Lsn>,
|
||||
prev_lsn: Option<Lsn>,
|
||||
}
|
||||
|
||||
/// `pagestream_v2 tenant timeline`
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
struct PageStreamCmd {
|
||||
tenant_id: TenantId,
|
||||
timeline_id: TimelineId,
|
||||
}
|
||||
|
||||
/// `lease lsn tenant timeline lsn`
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
struct LeaseLsnCmd {
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline_id: TimelineId,
|
||||
lsn: Lsn,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
enum PageServiceCmd {
|
||||
Set,
|
||||
PageStream(PageStreamCmd),
|
||||
BaseBackup(BaseBackupCmd),
|
||||
FullBackup(FullBackupCmd),
|
||||
LeaseLsn(LeaseLsnCmd),
|
||||
}
|
||||
|
||||
impl PageStreamCmd {
|
||||
fn parse(query: &str) -> anyhow::Result<Self> {
|
||||
let parameters = query.split_whitespace().collect_vec();
|
||||
if parameters.len() != 2 {
|
||||
bail!(
|
||||
"invalid number of parameters for pagestream command: {}",
|
||||
query
|
||||
);
|
||||
}
|
||||
let tenant_id = TenantId::from_str(parameters[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
|
||||
let timeline_id = TimelineId::from_str(parameters[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl FullBackupCmd {
|
||||
fn parse(query: &str) -> anyhow::Result<Self> {
|
||||
let parameters = query.split_whitespace().collect_vec();
|
||||
if parameters.len() < 2 || parameters.len() > 4 {
|
||||
bail!(
|
||||
"invalid number of parameters for basebackup command: {}",
|
||||
query
|
||||
);
|
||||
}
|
||||
let tenant_id = TenantId::from_str(parameters[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
|
||||
let timeline_id = TimelineId::from_str(parameters[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
|
||||
// The caller is responsible for providing correct lsn and prev_lsn.
|
||||
let lsn = if let Some(lsn_str) = parameters.get(2) {
|
||||
Some(
|
||||
Lsn::from_str(lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let prev_lsn = if let Some(prev_lsn_str) = parameters.get(3) {
|
||||
Some(
|
||||
Lsn::from_str(prev_lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl BaseBackupCmd {
|
||||
fn parse(query: &str) -> anyhow::Result<Self> {
|
||||
let parameters = query.split_whitespace().collect_vec();
|
||||
if parameters.len() < 2 {
|
||||
bail!(
|
||||
"invalid number of parameters for basebackup command: {}",
|
||||
query
|
||||
);
|
||||
}
|
||||
let tenant_id = TenantId::from_str(parameters[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
|
||||
let timeline_id = TimelineId::from_str(parameters[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
|
||||
let lsn;
|
||||
let flags_parse_from;
|
||||
if let Some(maybe_lsn) = parameters.get(2) {
|
||||
if *maybe_lsn == "latest" {
|
||||
lsn = None;
|
||||
flags_parse_from = 3;
|
||||
} else if maybe_lsn.starts_with("--") {
|
||||
lsn = None;
|
||||
flags_parse_from = 2;
|
||||
} else {
|
||||
lsn = Some(
|
||||
Lsn::from_str(maybe_lsn)
|
||||
.with_context(|| format!("Failed to parse lsn from {maybe_lsn}"))?,
|
||||
);
|
||||
flags_parse_from = 3;
|
||||
}
|
||||
} else {
|
||||
lsn = None;
|
||||
flags_parse_from = 2;
|
||||
}
|
||||
|
||||
let mut gzip = false;
|
||||
let mut replica = false;
|
||||
|
||||
for &param in &parameters[flags_parse_from..] {
|
||||
match param {
|
||||
"--gzip" => {
|
||||
if gzip {
|
||||
bail!("duplicate parameter for basebackup command: {param}")
|
||||
}
|
||||
gzip = true
|
||||
}
|
||||
"--replica" => {
|
||||
if replica {
|
||||
bail!("duplicate parameter for basebackup command: {param}")
|
||||
}
|
||||
replica = true
|
||||
}
|
||||
_ => bail!("invalid parameter for basebackup command: {param}"),
|
||||
}
|
||||
}
|
||||
Ok(Self {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
gzip,
|
||||
replica,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl LeaseLsnCmd {
|
||||
fn parse(query: &str) -> anyhow::Result<Self> {
|
||||
let parameters = query.split_whitespace().collect_vec();
|
||||
if parameters.len() != 3 {
|
||||
bail!(
|
||||
"invalid number of parameters for lease lsn command: {}",
|
||||
query
|
||||
);
|
||||
}
|
||||
let tenant_shard_id = TenantShardId::from_str(parameters[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?;
|
||||
let timeline_id = TimelineId::from_str(parameters[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?;
|
||||
let lsn = Lsn::from_str(parameters[2])
|
||||
.with_context(|| format!("Failed to parse lsn from {}", parameters[2]))?;
|
||||
Ok(Self {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl PageServiceCmd {
|
||||
fn parse(query: &str) -> anyhow::Result<Self> {
|
||||
let query = query.trim();
|
||||
let Some((cmd, other)) = query.split_once(' ') else {
|
||||
bail!("cannot parse query: {query}")
|
||||
};
|
||||
match cmd.to_ascii_lowercase().as_str() {
|
||||
"pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)),
|
||||
"basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)),
|
||||
"fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)),
|
||||
"lease" => {
|
||||
let Some((cmd2, other)) = other.split_once(' ') else {
|
||||
bail!("invalid lease command: {cmd}");
|
||||
};
|
||||
let cmd2 = cmd2.to_ascii_lowercase();
|
||||
if cmd2 == "lsn" {
|
||||
Ok(Self::LeaseLsn(LeaseLsnCmd::parse(other)?))
|
||||
} else {
|
||||
bail!("invalid lease command: {cmd}");
|
||||
}
|
||||
}
|
||||
"set" => Ok(Self::Set),
|
||||
_ => Err(anyhow::anyhow!("unsupported command {cmd} in {query}")),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<IO> postgres_backend::Handler<IO> for PageServerHandler
|
||||
where
|
||||
IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
|
||||
@@ -1494,137 +1273,201 @@ where
|
||||
fail::fail_point!("ps::connection-start::process-query");
|
||||
|
||||
let ctx = self.connection_ctx.attached_child();
|
||||
debug!("process query {query_string}");
|
||||
let query = PageServiceCmd::parse(query_string)?;
|
||||
match query {
|
||||
PageServiceCmd::PageStream(PageStreamCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
}) => {
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::PageStreamV2)
|
||||
.inc();
|
||||
|
||||
self.handle_pagerequests(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
PagestreamProtocolVersion::V2,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
debug!("process query {query_string:?}");
|
||||
let parts = query_string.split_whitespace().collect::<Vec<_>>();
|
||||
if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) {
|
||||
if params.len() != 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for pagestream command"
|
||||
)));
|
||||
}
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::PageStreamV2)
|
||||
.inc();
|
||||
|
||||
self.handle_pagerequests(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
gzip,
|
||||
replica,
|
||||
}) => {
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
PagestreamProtocolVersion::V2,
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
} else if let Some(params) = parts.strip_prefix(&["basebackup"]) {
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for basebackup command"
|
||||
)));
|
||||
}
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Basebackup)
|
||||
.inc();
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
let res = async {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
replica,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Result::<(), QueryError>::Ok(())
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Basebackup)
|
||||
.inc();
|
||||
|
||||
let (lsn, gzip) = match (params.get(2), params.get(3)) {
|
||||
(None, _) => (None, false),
|
||||
(Some(&"--gzip"), _) => (None, true),
|
||||
(Some(lsn_str), gzip_str_opt) => {
|
||||
let lsn = Lsn::from_str(lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?;
|
||||
let gzip = match gzip_str_opt {
|
||||
Some(&"--gzip") => true,
|
||||
None => false,
|
||||
Some(third_param) => {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"Parameter in position 3 unknown {third_param}",
|
||||
)))
|
||||
}
|
||||
};
|
||||
(Some(lsn), gzip)
|
||||
}
|
||||
.await;
|
||||
metric_recording.observe(&res);
|
||||
res?;
|
||||
}
|
||||
// same as basebackup, but result includes relational data as well
|
||||
PageServiceCmd::FullBackup(FullBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
}) => {
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
};
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Fullbackup)
|
||||
.inc();
|
||||
|
||||
// Check that the timeline exists
|
||||
let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx);
|
||||
let res = async {
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
prev_lsn,
|
||||
true,
|
||||
false,
|
||||
None,
|
||||
false,
|
||||
gzip,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
Result::<(), QueryError>::Ok(())
|
||||
}
|
||||
PageServiceCmd::Set => {
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
// on connect
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
.await;
|
||||
metric_recording.observe(&res);
|
||||
res?;
|
||||
}
|
||||
// same as basebackup, but result includes relational data as well
|
||||
else if let Some(params) = parts.strip_prefix(&["fullbackup"]) {
|
||||
if params.len() < 2 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number for fullbackup command"
|
||||
)));
|
||||
}
|
||||
PageServiceCmd::LeaseLsn(LeaseLsnCmd {
|
||||
tenant_shard_id,
|
||||
|
||||
let tenant_id = TenantId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
// The caller is responsible for providing correct lsn and prev_lsn.
|
||||
let lsn = if let Some(lsn_str) = params.get(2) {
|
||||
Some(
|
||||
Lsn::from_str(lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let prev_lsn = if let Some(prev_lsn_str) = params.get(3) {
|
||||
Some(
|
||||
Lsn::from_str(prev_lsn_str)
|
||||
.with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?,
|
||||
)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
self.check_permission(Some(tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::Fullbackup)
|
||||
.inc();
|
||||
|
||||
// Check that the timeline exists
|
||||
self.handle_basebackup_request(
|
||||
pgb,
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn,
|
||||
}) => {
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_shard_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::LeaseLsn)
|
||||
.inc();
|
||||
|
||||
match self
|
||||
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => {
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?
|
||||
}
|
||||
Err(e) => {
|
||||
error!("error obtaining lsn lease for {lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
}
|
||||
};
|
||||
prev_lsn,
|
||||
true,
|
||||
false,
|
||||
&ctx,
|
||||
)
|
||||
.await?;
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.to_ascii_lowercase().starts_with("set ") {
|
||||
// important because psycopg2 executes "SET datestyle TO 'ISO'"
|
||||
// on connect
|
||||
pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
|
||||
} else if query_string.starts_with("lease lsn ") {
|
||||
let params = &parts[2..];
|
||||
if params.len() != 3 {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"invalid param number {} for lease lsn command",
|
||||
params.len()
|
||||
)));
|
||||
}
|
||||
|
||||
let tenant_shard_id = TenantShardId::from_str(params[0])
|
||||
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
|
||||
let timeline_id = TimelineId::from_str(params[1])
|
||||
.with_context(|| format!("Failed to parse timeline id from {}", params[1]))?;
|
||||
|
||||
tracing::Span::current()
|
||||
.record("tenant_id", field::display(tenant_shard_id))
|
||||
.record("timeline_id", field::display(timeline_id));
|
||||
|
||||
self.check_permission(Some(tenant_shard_id.tenant_id))?;
|
||||
|
||||
COMPUTE_COMMANDS_COUNTERS
|
||||
.for_command(ComputeCommandKind::LeaseLsn)
|
||||
.inc();
|
||||
|
||||
// The caller is responsible for providing correct lsn.
|
||||
let lsn = Lsn::from_str(params[2])
|
||||
.with_context(|| format!("Failed to parse Lsn from {}", params[2]))?;
|
||||
|
||||
match self
|
||||
.handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
|
||||
.await
|
||||
{
|
||||
Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?,
|
||||
Err(e) => {
|
||||
error!("error obtaining lsn lease for {lsn}: {e:?}");
|
||||
pgb.write_message_noflush(&BeMessage::ErrorResponse(
|
||||
&e.to_string(),
|
||||
Some(e.pg_error_code()),
|
||||
))?
|
||||
}
|
||||
};
|
||||
} else {
|
||||
return Err(QueryError::Other(anyhow::anyhow!(
|
||||
"unknown command {query_string}"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -1673,181 +1516,3 @@ fn set_tracing_field_shard_id(timeline: &Timeline) {
|
||||
);
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use utils::shard::ShardCount;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn pageservice_cmd_parse() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let cmd =
|
||||
PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id} {timeline_id}")).unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::PageStream(PageStreamCmd {
|
||||
tenant_id,
|
||||
timeline_id
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: None,
|
||||
gzip: false,
|
||||
replica: false
|
||||
})
|
||||
);
|
||||
let cmd =
|
||||
PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} --gzip")).unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: None,
|
||||
gzip: true,
|
||||
replica: false
|
||||
})
|
||||
);
|
||||
let cmd =
|
||||
PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} latest")).unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: None,
|
||||
gzip: false,
|
||||
replica: false
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} 0/16ABCDE"))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
|
||||
gzip: false,
|
||||
replica: false
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"basebackup {tenant_id} {timeline_id} --replica --gzip"
|
||||
))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: None,
|
||||
gzip: true,
|
||||
replica: true
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"basebackup {tenant_id} {timeline_id} 0/16ABCDE --replica --gzip"
|
||||
))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::BaseBackup(BaseBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
|
||||
gzip: true,
|
||||
replica: true
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!("fullbackup {tenant_id} {timeline_id}")).unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::FullBackup(FullBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: None,
|
||||
prev_lsn: None
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"fullbackup {tenant_id} {timeline_id} 0/16ABCDE 0/16ABCDF"
|
||||
))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::FullBackup(FullBackupCmd {
|
||||
tenant_id,
|
||||
timeline_id,
|
||||
lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()),
|
||||
prev_lsn: Some(Lsn::from_str("0/16ABCDF").unwrap()),
|
||||
})
|
||||
);
|
||||
let tenant_shard_id = TenantShardId::unsharded(tenant_id);
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
|
||||
))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::LeaseLsn(LeaseLsnCmd {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn: Lsn::from_str("0/16ABCDE").unwrap(),
|
||||
})
|
||||
);
|
||||
let tenant_shard_id = TenantShardId::split(&tenant_shard_id, ShardCount(8))[1];
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE"
|
||||
))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
cmd,
|
||||
PageServiceCmd::LeaseLsn(LeaseLsnCmd {
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
lsn: Lsn::from_str("0/16ABCDE").unwrap(),
|
||||
})
|
||||
);
|
||||
let cmd = PageServiceCmd::parse("set a = b").unwrap();
|
||||
assert_eq!(cmd, PageServiceCmd::Set);
|
||||
let cmd = PageServiceCmd::parse("SET foo").unwrap();
|
||||
assert_eq!(cmd, PageServiceCmd::Set);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn pageservice_cmd_err_handling() {
|
||||
let tenant_id = TenantId::generate();
|
||||
let timeline_id = TimelineId::generate();
|
||||
let cmd = PageServiceCmd::parse("unknown_command");
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse("pagestream_v2");
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx"));
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx {timeline_id}xxx"));
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"basebackup {tenant_id} {timeline_id} --gzip --gzip"
|
||||
));
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"basebackup {tenant_id} {timeline_id} --gzip --unknown"
|
||||
));
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse(&format!(
|
||||
"basebackup {tenant_id} {timeline_id} --gzip 0/16ABCDE"
|
||||
));
|
||||
assert!(cmd.is_err());
|
||||
let cmd = PageServiceCmd::parse(&format!("lease {tenant_id} {timeline_id} gzip 0/16ABCDE"));
|
||||
assert!(cmd.is_err());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,14 +7,14 @@
|
||||
//! Clarify that)
|
||||
//!
|
||||
use super::tenant::{PageReconstructError, Timeline};
|
||||
use crate::aux_file;
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::{aux_file, repository::*};
|
||||
use anyhow::{ensure, Context};
|
||||
use bytes::{Buf, Bytes, BytesMut};
|
||||
use enum_map::Enum;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::key::{
|
||||
dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
|
||||
relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
|
||||
@@ -22,10 +22,7 @@ use pageserver_api::key::{
|
||||
CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
|
||||
};
|
||||
use pageserver_api::keyspace::SparseKeySpace;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
|
||||
use pageserver_api::shard::ShardIdentity;
|
||||
use pageserver_api::value::Value;
|
||||
use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
|
||||
use postgres_ffi::BLCKSZ;
|
||||
use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId};
|
||||
@@ -39,13 +36,12 @@ use tracing::{debug, trace, warn};
|
||||
use utils::bin_ser::DeserializeError;
|
||||
use utils::pausable_failpoint;
|
||||
use utils::{bin_ser::BeSer, lsn::Lsn};
|
||||
use wal_decoder::serialized_batch::SerializedValueBatch;
|
||||
|
||||
/// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached.
|
||||
pub const MAX_AUX_FILE_DELTAS: usize = 1024;
|
||||
|
||||
/// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached.
|
||||
pub const MAX_AUX_FILE_V2_DELTAS: usize = 16;
|
||||
pub const MAX_AUX_FILE_V2_DELTAS: usize = 64;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum LsnForTimestamp {
|
||||
@@ -172,11 +168,12 @@ impl Timeline {
|
||||
tline: self,
|
||||
pending_lsns: Vec::new(),
|
||||
pending_metadata_pages: HashMap::new(),
|
||||
pending_data_batch: None,
|
||||
pending_data_pages: Vec::new(),
|
||||
pending_zero_data_pages: Default::default(),
|
||||
pending_deletions: Vec::new(),
|
||||
pending_nblocks: 0,
|
||||
pending_directory_entries: Vec::new(),
|
||||
pending_metadata_bytes: 0,
|
||||
pending_bytes: 0,
|
||||
lsn,
|
||||
}
|
||||
}
|
||||
@@ -1026,14 +1023,21 @@ pub struct DatadirModification<'a> {
|
||||
|
||||
/// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for
|
||||
/// which keys are stored here.
|
||||
pending_data_batch: Option<SerializedValueBatch>,
|
||||
pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>,
|
||||
|
||||
// Sometimes during ingest, for example when extending a relation, we would like to write a zero page. However,
|
||||
// if we encounter a write from postgres in the same wal record, we will drop this entry.
|
||||
//
|
||||
// Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed
|
||||
// at the end of each wal record, and all these writes implicitly are at lsn Self::lsn
|
||||
pending_zero_data_pages: HashSet<CompactKey>,
|
||||
|
||||
/// For special "directory" keys that store key-value maps, track the size of the map
|
||||
/// if it was updated in this modification.
|
||||
pending_directory_entries: Vec<(DirectoryKind, usize)>,
|
||||
|
||||
/// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
|
||||
pending_metadata_bytes: usize,
|
||||
/// An **approximation** of how large our EphemeralFile write will be when committed.
|
||||
pending_bytes: usize,
|
||||
}
|
||||
|
||||
impl<'a> DatadirModification<'a> {
|
||||
@@ -1048,17 +1052,11 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
pub(crate) fn approx_pending_bytes(&self) -> usize {
|
||||
self.pending_data_batch
|
||||
.as_ref()
|
||||
.map_or(0, |b| b.buffer_size())
|
||||
+ self.pending_metadata_bytes
|
||||
self.pending_bytes
|
||||
}
|
||||
|
||||
pub(crate) fn has_dirty_data(&self) -> bool {
|
||||
!self
|
||||
.pending_data_batch
|
||||
.as_ref()
|
||||
.map_or(true, |b| b.is_empty())
|
||||
pub(crate) fn has_dirty_data_pages(&self) -> bool {
|
||||
(!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty())
|
||||
}
|
||||
|
||||
/// Set the current lsn
|
||||
@@ -1070,6 +1068,9 @@ impl<'a> DatadirModification<'a> {
|
||||
self.lsn
|
||||
);
|
||||
|
||||
// If we are advancing LSN, then state from previous wal record should have been flushed.
|
||||
assert!(self.pending_zero_data_pages.is_empty());
|
||||
|
||||
if lsn > self.lsn {
|
||||
self.pending_lsns.push(self.lsn);
|
||||
self.lsn = lsn;
|
||||
@@ -1144,107 +1145,6 @@ impl<'a> DatadirModification<'a> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Creates a relation if it is not already present.
|
||||
/// Returns the current size of the relation
|
||||
pub(crate) async fn create_relation_if_required(
|
||||
&mut self,
|
||||
rel: RelTag,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<u32, PageReconstructError> {
|
||||
// Get current size and put rel creation if rel doesn't exist
|
||||
//
|
||||
// NOTE: we check the cache first even though get_rel_exists and get_rel_size would
|
||||
// check the cache too. This is because eagerly checking the cache results in
|
||||
// less work overall and 10% better performance. It's more work on cache miss
|
||||
// but cache miss is rare.
|
||||
if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) {
|
||||
Ok(nblocks)
|
||||
} else if !self
|
||||
.tline
|
||||
.get_rel_exists(rel, Version::Modified(self), ctx)
|
||||
.await?
|
||||
{
|
||||
// create it with 0 size initially, the logic below will extend it
|
||||
self.put_rel_creation(rel, 0, ctx)
|
||||
.await
|
||||
.context("Relation Error")?;
|
||||
Ok(0)
|
||||
} else {
|
||||
self.tline
|
||||
.get_rel_size(rel, Version::Modified(self), ctx)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
/// Given a block number for a relation (which represents a newly written block),
|
||||
/// the previous block count of the relation, and the shard info, find the gaps
|
||||
/// that were created by the newly written block if any.
|
||||
fn find_gaps(
|
||||
rel: RelTag,
|
||||
blkno: u32,
|
||||
previous_nblocks: u32,
|
||||
shard: &ShardIdentity,
|
||||
) -> Option<KeySpace> {
|
||||
let mut key = rel_block_to_key(rel, blkno);
|
||||
let mut gap_accum = None;
|
||||
|
||||
for gap_blkno in previous_nblocks..blkno {
|
||||
key.field6 = gap_blkno;
|
||||
|
||||
if shard.get_shard_number(&key) != shard.number {
|
||||
continue;
|
||||
}
|
||||
|
||||
gap_accum
|
||||
.get_or_insert_with(KeySpaceAccum::new)
|
||||
.add_key(key);
|
||||
}
|
||||
|
||||
gap_accum.map(|accum| accum.to_keyspace())
|
||||
}
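// Worked example of the contract in the doc comment above (sketch only; shard
// filtering is ignored, i.e. this models an unsharded tenant): writing block
// `blkno` to a relation that previously had `previous_nblocks` blocks leaves
// the half-open range previous_nblocks..blkno as gaps to be zero-filled.
fn gap_blocks_unsharded(blkno: u32, previous_nblocks: u32) -> Vec<u32> {
    (previous_nblocks..blkno).collect()
}
// gap_blocks_unsharded(6, 3) == vec![3, 4, 5]; gap_blocks_unsharded(2, 3) is empty.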
|
||||
|
||||
pub async fn ingest_batch(
|
||||
&mut self,
|
||||
mut batch: SerializedValueBatch,
|
||||
// TODO(vlad): remove this argument and replace the shard check with is_key_local
|
||||
shard: &ShardIdentity,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut gaps_at_lsns = Vec::default();
|
||||
|
||||
for meta in batch.metadata.iter() {
|
||||
let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?;
|
||||
let new_nblocks = blkno + 1;
|
||||
|
||||
let old_nblocks = self.create_relation_if_required(rel, ctx).await?;
|
||||
if new_nblocks > old_nblocks {
|
||||
self.put_rel_extend(rel, new_nblocks, ctx).await?;
|
||||
}
|
||||
|
||||
if let Some(gaps) = Self::find_gaps(rel, blkno, old_nblocks, shard) {
|
||||
gaps_at_lsns.push((gaps, meta.lsn()));
|
||||
}
|
||||
}
|
||||
|
||||
if !gaps_at_lsns.is_empty() {
|
||||
batch.zero_gaps(gaps_at_lsns);
|
||||
}
|
||||
|
||||
match self.pending_data_batch.as_mut() {
|
||||
Some(pending_batch) => {
|
||||
pending_batch.extend(batch);
|
||||
}
|
||||
None if !batch.is_empty() => {
|
||||
self.pending_data_batch = Some(batch);
|
||||
}
|
||||
None => {
|
||||
// Nothing to initialize the batch with
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Put a new page version that can be constructed from a WAL record
|
||||
///
|
||||
/// NOTE: this will *not* implicitly extend the relation, if the page is beyond the
|
||||
@@ -1327,13 +1227,8 @@ impl<'a> DatadirModification<'a> {
|
||||
self.lsn
|
||||
);
|
||||
}
|
||||
|
||||
let batch = self
|
||||
.pending_data_batch
|
||||
.get_or_insert_with(SerializedValueBatch::default);
|
||||
|
||||
batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn);
|
||||
|
||||
self.pending_zero_data_pages.insert(key.to_compact());
|
||||
self.pending_bytes += ZERO_PAGE.len();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1351,16 +1246,19 @@ impl<'a> DatadirModification<'a> {
|
||||
self.lsn
|
||||
);
|
||||
}
|
||||
|
||||
let batch = self
|
||||
.pending_data_batch
|
||||
.get_or_insert_with(SerializedValueBatch::default);
|
||||
|
||||
batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn);
|
||||
|
||||
self.pending_zero_data_pages.insert(key.to_compact());
|
||||
self.pending_bytes += ZERO_PAGE.len();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Call this at the end of each WAL record.
pub(crate) fn on_record_end(&mut self) {
    let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages);
    for key in pending_zero_data_pages {
        self.put_data(key, Value::Image(ZERO_PAGE.clone()));
    }
}
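
// Editorial sketch, not part of the diff: the deferred zero-page bookkeeping in
// miniature. Zero-fills requested while a WAL record is being ingested are only
// materialized at record end, and a real write to the same key within that record
// cancels the pending zero-fill. All names below are hypothetical; only the shape
// of the bookkeeping mirrors `put_data`/`on_record_end` above.
use std::collections::HashSet;

struct MiniModification {
    pending_zero_pages: HashSet<u32>, // keys still awaiting a synthesized zero image
    batch: Vec<(u32, &'static str)>,  // (key, value) pairs queued for the writer
}

impl MiniModification {
    fn zero_page(&mut self, key: u32) {
        // Remember the key; the actual zero image is deferred to on_record_end().
        self.pending_zero_pages.insert(key);
    }

    fn put_data(&mut self, key: u32, val: &'static str) {
        // A real write within the same record supersedes the pending zero-fill.
        self.pending_zero_pages.remove(&key);
        self.batch.push((key, val));
    }

    fn on_record_end(&mut self) {
        // Whatever was never overwritten gets an explicit zero image now.
        for key in std::mem::take(&mut self.pending_zero_pages) {
            self.batch.push((key, "ZERO_PAGE"));
        }
    }
}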
|
||||
|
||||
/// Store a relmapper file (pg_filenode.map) in the repository
|
||||
pub async fn put_relmap_file(
|
||||
&mut self,
|
||||
@@ -1850,17 +1748,12 @@ impl<'a> DatadirModification<'a> {
|
||||
let mut writer = self.tline.writer().await;
|
||||
|
||||
// Flush relation and SLRU data blocks, keep metadata.
|
||||
if let Some(batch) = self.pending_data_batch.take() {
|
||||
tracing::debug!(
|
||||
"Flushing batch with max_lsn={}. Last record LSN is {}",
|
||||
batch.max_lsn,
|
||||
self.tline.get_last_record_lsn()
|
||||
);
|
||||
let pending_data_pages = std::mem::take(&mut self.pending_data_pages);
|
||||
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put_batch(batch, ctx).await?;
|
||||
}
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put_batch(pending_data_pages, ctx).await?;
|
||||
self.pending_bytes = 0;
|
||||
|
||||
if pending_nblocks != 0 {
|
||||
writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
|
||||
@@ -1880,6 +1773,9 @@ impl<'a> DatadirModification<'a> {
|
||||
/// All the modifications in this atomic update are stamped by the specified LSN.
|
||||
///
|
||||
pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
|
||||
// Commit should never be called mid-wal-record
|
||||
assert!(self.pending_zero_data_pages.is_empty());
|
||||
|
||||
let mut writer = self.tline.writer().await;
|
||||
|
||||
let pending_nblocks = self.pending_nblocks;
|
||||
@@ -1887,49 +1783,21 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
// Ordering: the items in this batch do not need to be in any global order, but values for
|
||||
// a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on
|
||||
// this to do efficient updates to its index. See [`wal_decoder::serialized_batch`] for
|
||||
// more details.
|
||||
// this to do efficient updates to its index.
|
||||
let mut write_batch = std::mem::take(&mut self.pending_data_pages);
|
||||
|
||||
let metadata_batch = {
|
||||
let pending_meta = self
|
||||
.pending_metadata_pages
|
||||
write_batch.extend(
|
||||
self.pending_metadata_pages
|
||||
.drain()
|
||||
.flat_map(|(key, values)| {
|
||||
values
|
||||
.into_iter()
|
||||
.map(move |(lsn, value_size, value)| (key, lsn, value_size, value))
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
}),
|
||||
);
|
||||
|
||||
if pending_meta.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(SerializedValueBatch::from_values(pending_meta))
|
||||
}
|
||||
};
|
||||
|
||||
let data_batch = self.pending_data_batch.take();
|
||||
|
||||
let maybe_batch = match (data_batch, metadata_batch) {
|
||||
(Some(mut data), Some(metadata)) => {
|
||||
data.extend(metadata);
|
||||
Some(data)
|
||||
}
|
||||
(Some(data), None) => Some(data),
|
||||
(None, Some(metadata)) => Some(metadata),
|
||||
(None, None) => None,
|
||||
};
|
||||
|
||||
if let Some(batch) = maybe_batch {
|
||||
tracing::debug!(
|
||||
"Flushing batch with max_lsn={}. Last record LSN is {}",
|
||||
batch.max_lsn,
|
||||
self.tline.get_last_record_lsn()
|
||||
);
|
||||
|
||||
// This bails out on first error without modifying pending_updates.
|
||||
// That's Ok, cf this function's doc comment.
|
||||
writer.put_batch(batch, ctx).await?;
|
||||
if !write_batch.is_empty() {
|
||||
writer.put_batch(write_batch, ctx).await?;
|
||||
}
|
||||
|
||||
if !self.pending_deletions.is_empty() {
|
||||
@@ -1939,9 +1807,6 @@ impl<'a> DatadirModification<'a> {
|
||||
|
||||
self.pending_lsns.push(self.lsn);
|
||||
for pending_lsn in self.pending_lsns.drain(..) {
|
||||
// TODO(vlad): pretty sure the comment below is not valid anymore
|
||||
// and we can call finish write with the latest LSN
|
||||
//
|
||||
// Ideally, we should be able to call writer.finish_write() only once
|
||||
// with the highest LSN. However, the last_record_lsn variable in the
|
||||
// timeline keeps track of the latest LSN and the immediate previous LSN
|
||||
@@ -1957,14 +1822,14 @@ impl<'a> DatadirModification<'a> {
|
||||
writer.update_directory_entries_count(kind, count as u64);
|
||||
}
|
||||
|
||||
self.pending_metadata_bytes = 0;
|
||||
self.pending_bytes = 0;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
self.pending_metadata_pages.len()
|
||||
+ self.pending_data_batch.as_ref().map_or(0, |b| b.len())
|
||||
+ self.pending_data_pages.len()
|
||||
+ self.pending_deletions.len()
|
||||
}
|
||||
|
||||
@@ -2006,10 +1871,11 @@ impl<'a> DatadirModification<'a> {
|
||||
// modifications before ingesting DB create operations, which are the only kind that reads
|
||||
// data pages during ingest.
|
||||
if cfg!(debug_assertions) {
|
||||
assert!(!self
|
||||
.pending_data_batch
|
||||
.as_ref()
|
||||
.map_or(false, |b| b.updates_key(&key)));
|
||||
for (dirty_key, _, _, _) in &self.pending_data_pages {
|
||||
debug_assert!(&key.to_compact() != dirty_key);
|
||||
}
|
||||
|
||||
debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact()))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2027,10 +1893,18 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
fn put_data(&mut self, key: CompactKey, val: Value) {
|
||||
let batch = self
|
||||
.pending_data_batch
|
||||
.get_or_insert_with(SerializedValueBatch::default);
|
||||
batch.put(key, val, self.lsn);
|
||||
let val_serialized_size = val.serialized_size().unwrap() as usize;
|
||||
|
||||
// If this page was previously zero'd in the same WalRecord, then drop the previous zero page write. This
|
||||
// is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend),
|
||||
// and the subsequent postgres-originating write
|
||||
if self.pending_zero_data_pages.remove(&key) {
|
||||
self.pending_bytes -= ZERO_PAGE.len();
|
||||
}
|
||||
|
||||
self.pending_bytes += val_serialized_size;
|
||||
self.pending_data_pages
|
||||
.push((key, self.lsn, val_serialized_size, val))
|
||||
}
|
||||
|
||||
fn put_metadata(&mut self, key: CompactKey, val: Value) {
|
||||
@@ -2038,10 +1912,10 @@ impl<'a> DatadirModification<'a> {
|
||||
// Replace the previous value if it exists at the same lsn
|
||||
if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() {
|
||||
if *last_lsn == self.lsn {
|
||||
// Update the pending_metadata_bytes contribution from this entry, and update the serialized size in place
|
||||
self.pending_metadata_bytes -= *last_value_ser_size;
|
||||
// Update the pending_bytes contribution from this entry, and update the serialized size in place
|
||||
self.pending_bytes -= *last_value_ser_size;
|
||||
*last_value_ser_size = val.serialized_size().unwrap() as usize;
|
||||
self.pending_metadata_bytes += *last_value_ser_size;
|
||||
self.pending_bytes += *last_value_ser_size;
|
||||
|
||||
// Use the latest value, this replaces any earlier write to the same (key,lsn), such as may
|
||||
// have been generated by synthesized zero page writes prior to the first real write to a page.
|
||||
@@ -2051,12 +1925,8 @@ impl<'a> DatadirModification<'a> {
|
||||
}
|
||||
|
||||
let val_serialized_size = val.serialized_size().unwrap() as usize;
|
||||
self.pending_metadata_bytes += val_serialized_size;
|
||||
self.pending_bytes += val_serialized_size;
|
||||
values.push((self.lsn, val_serialized_size, val));
|
||||
|
||||
if key == CHECKPOINT_KEY.to_compact() {
|
||||
tracing::debug!("Checkpoint key added to pending with size {val_serialized_size}");
|
||||
}
|
||||
}
|
||||
|
||||
fn delete(&mut self, key_range: Range<Key>) {
|
||||
@@ -2165,11 +2035,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use hex_literal::hex;
|
||||
use pageserver_api::{models::ShardParameters, shard::ShardStripeSize};
|
||||
use utils::{
|
||||
id::TimelineId,
|
||||
shard::{ShardCount, ShardNumber},
|
||||
};
|
||||
use utils::id::TimelineId;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -2223,93 +2089,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn gap_finding() {
|
||||
let rel = RelTag {
|
||||
spcnode: 1663,
|
||||
dbnode: 208101,
|
||||
relnode: 2620,
|
||||
forknum: 0,
|
||||
};
|
||||
let base_blkno = 1;
|
||||
|
||||
let base_key = rel_block_to_key(rel, base_blkno);
|
||||
let before_base_key = rel_block_to_key(rel, base_blkno - 1);
|
||||
|
||||
let shard = ShardIdentity::unsharded();
|
||||
|
||||
let mut previous_nblocks = 0;
|
||||
for i in 0..10 {
|
||||
let crnt_blkno = base_blkno + i;
|
||||
let gaps = DatadirModification::find_gaps(rel, crnt_blkno, previous_nblocks, &shard);
|
||||
|
||||
previous_nblocks = crnt_blkno + 1;
|
||||
|
||||
if i == 0 {
|
||||
// The first block we write is 1, so we should find the gap.
|
||||
assert_eq!(gaps.unwrap(), KeySpace::single(before_base_key..base_key));
|
||||
} else {
|
||||
assert!(gaps.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
// This is an update to an already existing block. No gaps here.
|
||||
let update_blkno = 5;
|
||||
let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard);
|
||||
assert!(gaps.is_none());
|
||||
|
||||
// This is an update past the current end block.
|
||||
let after_gap_blkno = 20;
|
||||
let gaps = DatadirModification::find_gaps(rel, after_gap_blkno, previous_nblocks, &shard);
|
||||
|
||||
let gap_start_key = rel_block_to_key(rel, previous_nblocks);
|
||||
let after_gap_key = rel_block_to_key(rel, after_gap_blkno);
|
||||
assert_eq!(
|
||||
gaps.unwrap(),
|
||||
KeySpace::single(gap_start_key..after_gap_key)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sharded_gap_finding() {
|
||||
let rel = RelTag {
|
||||
spcnode: 1663,
|
||||
dbnode: 208101,
|
||||
relnode: 2620,
|
||||
forknum: 0,
|
||||
};
|
||||
|
||||
let first_blkno = 6;
|
||||
|
||||
// This shard will get the even blocks
|
||||
let shard = ShardIdentity::from_params(
|
||||
ShardNumber(0),
|
||||
&ShardParameters {
|
||||
count: ShardCount(2),
|
||||
stripe_size: ShardStripeSize(1),
|
||||
},
|
||||
);
|
||||
|
||||
// Only keys belonging to this shard are considered as gaps.
|
||||
let mut previous_nblocks = 0;
|
||||
let gaps =
|
||||
DatadirModification::find_gaps(rel, first_blkno, previous_nblocks, &shard).unwrap();
|
||||
assert!(!gaps.ranges.is_empty());
|
||||
for gap_range in gaps.ranges {
|
||||
let mut k = gap_range.start;
|
||||
while k != gap_range.end {
|
||||
assert_eq!(shard.get_shard_number(&k), shard.number);
|
||||
k = k.next();
|
||||
}
|
||||
}
|
||||
|
||||
previous_nblocks = first_blkno;
|
||||
|
||||
let update_blkno = 2;
|
||||
let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard);
|
||||
assert!(gaps.is_none());
|
||||
}
|
||||
|
||||
/*
|
||||
fn assert_current_logical_size<R: Repository>(timeline: &DatadirTimeline<R>, lsn: Lsn) {
|
||||
let incremental = timeline.get_current_logical_size();
|
||||
|
||||
@@ -1,16 +1,13 @@
|
||||
//! This module defines the value type used by the storage engine.
//!
//! A [`Value`] represents either a completely new value for one Key ([`Value::Image`]),
//! or a "delta" of how to get from a previous version of the value to the new one
//! ([`Value::WalRecord`]).
//!
//! Note that the [`Value`] type is used for the permanent storage format, so any
//! changes to it must be backwards compatible.
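//!
//! Editorial sketch, not part of the diff: constructing the two variants. A full page
//! image initializes the page on its own, while a plain WAL record only describes a
//! delta on top of an earlier version; the example record below is illustrative.
//!
//! ```ignore
//! use bytes::Bytes;
//!
//! let img = Value::Image(Bytes::from_static(b"full page contents"));
//! assert!(img.is_image() && img.will_init());
//!
//! let delta = Value::WalRecord(NeonWalRecord::Postgres {
//!     will_init: false,
//!     rec: Bytes::from_static(b"wal bytes"),
//! });
//! assert!(!delta.is_image() && !delta.will_init());
//! ```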
|
||||
|
||||
use crate::record::NeonWalRecord;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use anyhow::Result;
|
||||
use bytes::Bytes;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::ops::AddAssign;
|
||||
use std::time::Duration;
|
||||
|
||||
pub use pageserver_api::key::{Key, KEY_SIZE};
|
||||
|
||||
/// A 'value' stored for a one Key.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub enum Value {
|
||||
/// An Image value contains a full copy of the value
|
||||
@@ -23,12 +20,10 @@ pub enum Value {
|
||||
}
|
||||
|
||||
impl Value {
|
||||
#[inline(always)]
|
||||
pub fn is_image(&self) -> bool {
|
||||
matches!(self, Value::Image(_))
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn will_init(&self) -> bool {
|
||||
match self {
|
||||
Value::Image(_) => true,
|
||||
@@ -38,18 +33,17 @@ impl Value {
|
||||
}
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum InvalidInput {
|
||||
pub(crate) enum InvalidInput {
|
||||
TooShortValue,
|
||||
TooShortPostgresRecord,
|
||||
}
|
||||
|
||||
/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, let's
|
||||
/// use this type for querying if a slice looks some particular way.
|
||||
pub struct ValueBytes;
|
||||
pub(crate) struct ValueBytes;
|
||||
|
||||
impl ValueBytes {
|
||||
#[inline(always)]
|
||||
pub fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
|
||||
pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
|
||||
if raw.len() < 12 {
|
||||
return Err(InvalidInput::TooShortValue);
|
||||
}
|
||||
@@ -85,7 +79,6 @@ impl ValueBytes {
|
||||
mod test {
|
||||
use super::*;
|
||||
|
||||
use bytes::Bytes;
|
||||
use utils::bin_ser::BeSer;
|
||||
|
||||
macro_rules! roundtrip {
|
||||
@@ -236,3 +229,56 @@ mod test {
|
||||
assert!(!ValueBytes::will_init(&expected).unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
#[derive(Default, Serialize, Debug)]
|
||||
pub struct GcResult {
|
||||
pub layers_total: u64,
|
||||
pub layers_needed_by_cutoff: u64,
|
||||
pub layers_needed_by_pitr: u64,
|
||||
pub layers_needed_by_branches: u64,
|
||||
pub layers_needed_by_leases: u64,
|
||||
pub layers_not_updated: u64,
|
||||
pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
|
||||
|
||||
#[serde(serialize_with = "serialize_duration_as_millis")]
|
||||
pub elapsed: Duration,
|
||||
|
||||
/// The layers which were garbage collected.
|
||||
///
|
||||
/// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
|
||||
/// dropped in tests.
|
||||
#[cfg(feature = "testing")]
|
||||
#[serde(skip)]
|
||||
pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
|
||||
}
|
||||
|
||||
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
|
||||
fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
d.as_millis().serialize(serializer)
|
||||
}
|
||||
|
||||
impl AddAssign for GcResult {
    fn add_assign(&mut self, other: Self) {
        self.layers_total += other.layers_total;
        self.layers_needed_by_pitr += other.layers_needed_by_pitr;
        self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
        self.layers_needed_by_branches += other.layers_needed_by_branches;
        self.layers_needed_by_leases += other.layers_needed_by_leases;
        self.layers_not_updated += other.layers_not_updated;
        self.layers_removed += other.layers_removed;

        self.elapsed += other.elapsed;

        #[cfg(feature = "testing")]
        {
            let mut other = other;
            self.doomed_layers.append(&mut other.doomed_layers);
        }
    }
}
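
// Editorial sketch, not part of the diff: `AddAssign` exists so that per-timeline GC
// results can be folded into a tenant-wide total, which is also why `elapsed` is summed
// rather than tracked as wall-clock time. The helper below is illustrative.
fn fold_gc_results(per_timeline: Vec<GcResult>) -> GcResult {
    let mut total = GcResult::default();
    for result in per_timeline {
        total += result; // uses the AddAssign impl above
    }
    total
}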
|
||||
@@ -92,11 +92,11 @@ use crate::metrics::{
|
||||
remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN,
|
||||
TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC,
|
||||
};
|
||||
use crate::repository::GcResult;
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::config::LocationMode;
|
||||
use crate::tenant::config::TenantConfOpt;
|
||||
use crate::tenant::gc_result::GcResult;
|
||||
pub use crate::tenant::remote_timeline_client::index::IndexPart;
|
||||
use crate::tenant::remote_timeline_client::remote_initdb_archive_path;
|
||||
use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
|
||||
@@ -160,7 +160,6 @@ pub(crate) mod timeline;
|
||||
pub mod size;
|
||||
|
||||
mod gc_block;
|
||||
mod gc_result;
|
||||
pub(crate) mod throttle;
|
||||
|
||||
pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
@@ -302,13 +301,6 @@ pub struct Tenant {
|
||||
/// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
|
||||
timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,
|
||||
|
||||
/// Serialize writes of the tenant manifest to remote storage. If there are concurrent operations
|
||||
/// affecting the manifest, such as timeline deletion and timeline offload, they must wait for
|
||||
/// each other (this could be optimized to coalesce writes if necessary).
|
||||
///
|
||||
/// The contents of the Mutex are the last manifest we successfully uploaded
|
||||
tenant_manifest_upload: tokio::sync::Mutex<Option<TenantManifest>>,
|
||||
|
||||
// This mutex prevents creation of new timelines during GC.
|
||||
// Adding yet another mutex (in addition to `timelines`) is needed because holding
|
||||
// `timelines` mutex during all GC iteration
|
||||
@@ -475,10 +467,10 @@ impl WalRedoManager {
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn request_redo(
|
||||
&self,
|
||||
key: pageserver_api::key::Key,
|
||||
key: crate::repository::Key,
|
||||
lsn: Lsn,
|
||||
base_img: Option<(Lsn, bytes::Bytes)>,
|
||||
records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>,
|
||||
records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> Result<bytes::Bytes, walredo::Error> {
|
||||
match self {
|
||||
@@ -521,6 +513,13 @@ pub struct OffloadedTimeline {
|
||||
/// Present for future flattening deliberations.
|
||||
pub archived_at: NaiveDateTime,
|
||||
|
||||
/// Lazily constructed remote client for the timeline
|
||||
///
|
||||
/// If we offload a timeline, we keep around the remote client
|
||||
/// for the duration of the process. If we find it through the
|
||||
/// manifest, we don't construct it until it's needed (deletion).
|
||||
pub remote_client: Option<Arc<RemoteTimelineClient>>,
|
||||
|
||||
/// Prevent two tasks from deleting the timeline at the same time. If held, the
|
||||
/// timeline is being deleted. If 'true', the timeline has already been deleted.
|
||||
pub delete_progress: TimelineDeleteProgress,
|
||||
@@ -547,6 +546,7 @@ impl OffloadedTimeline {
|
||||
ancestor_retain_lsn,
|
||||
archived_at,
|
||||
|
||||
remote_client: Some(timeline.remote_client.clone()),
|
||||
delete_progress: timeline.delete_progress.clone(),
|
||||
})
|
||||
}
|
||||
@@ -563,6 +563,7 @@ impl OffloadedTimeline {
|
||||
ancestor_timeline_id,
|
||||
ancestor_retain_lsn,
|
||||
archived_at,
|
||||
remote_client: None,
|
||||
delete_progress: TimelineDeleteProgress::default(),
|
||||
}
|
||||
}
|
||||
@@ -624,10 +625,19 @@ impl TimelineOrOffloaded {
|
||||
TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress,
|
||||
}
|
||||
}
|
||||
fn maybe_remote_client(&self) -> Option<Arc<RemoteTimelineClient>> {
|
||||
fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc<RemoteTimelineClient> {
|
||||
match self {
|
||||
TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()),
|
||||
TimelineOrOffloaded::Offloaded(_offloaded) => None,
|
||||
TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(),
|
||||
TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() {
|
||||
Some(remote_client) => remote_client,
|
||||
None => {
|
||||
let remote_client = tenant.build_timeline_client(
|
||||
offloaded.timeline_id,
|
||||
tenant.remote_storage.clone(),
|
||||
);
|
||||
Arc::new(remote_client)
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -739,24 +749,6 @@ pub enum TimelineArchivalError {
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum TenantManifestError {
|
||||
#[error("Remote storage error: {0}")]
|
||||
RemoteStorage(anyhow::Error),
|
||||
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
}
|
||||
|
||||
impl From<TenantManifestError> for TimelineArchivalError {
|
||||
fn from(e: TenantManifestError) -> Self {
|
||||
match e {
|
||||
TenantManifestError::RemoteStorage(e) => Self::Other(e),
|
||||
TenantManifestError::Cancelled => Self::Cancelled,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for TimelineArchivalError {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
match self {
|
||||
@@ -1343,15 +1335,14 @@ impl Tenant {
|
||||
)
|
||||
.await?;
|
||||
let (offloaded_add, tenant_manifest) =
|
||||
match remote_timeline_client::download_tenant_manifest(
|
||||
match remote_timeline_client::do_download_tenant_manifest(
|
||||
remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
self.generation,
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok((tenant_manifest, _generation, _manifest_mtime)) => (
|
||||
Ok((tenant_manifest, _generation)) => (
|
||||
format!("{} offloaded", tenant_manifest.offloaded_timelines.len()),
|
||||
tenant_manifest,
|
||||
),
|
||||
@@ -1543,7 +1534,18 @@ impl Tenant {
|
||||
offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
|
||||
}
|
||||
if !offloaded_timeline_ids.is_empty() {
|
||||
self.store_tenant_manifest().await?;
|
||||
let manifest = self.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
generation,
|
||||
&manifest,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(TimelineArchivalError::Other)?;
|
||||
}
|
||||
|
||||
// The local filesystem contents are a cache of what's in the remote IndexPart;
|
||||
@@ -1828,18 +1830,6 @@ impl Tenant {
|
||||
ctx: RequestContext,
|
||||
) -> Result<Arc<Timeline>, TimelineArchivalError> {
|
||||
info!("unoffloading timeline");
|
||||
|
||||
// We activate the timeline below manually, so this must be called on an active timeline.
|
||||
// We expect callers of this function to ensure this.
|
||||
match self.current_state() {
|
||||
TenantState::Activating { .. }
|
||||
| TenantState::Attaching
|
||||
| TenantState::Broken { .. } => {
|
||||
panic!("Timeline expected to be active")
|
||||
}
|
||||
TenantState::Stopping { .. } => return Err(TimelineArchivalError::Cancelled),
|
||||
TenantState::Active => {}
|
||||
}
|
||||
let cancel = self.cancel.clone();
|
||||
|
||||
// Protect against concurrent attempts to use this TimelineId
|
||||
@@ -1924,7 +1914,18 @@ impl Tenant {
|
||||
};
|
||||
|
||||
// Upload new list of offloaded timelines to S3
|
||||
self.store_tenant_manifest().await?;
|
||||
let manifest = self.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
generation,
|
||||
&manifest,
|
||||
&cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(TimelineArchivalError::Other)?;
|
||||
|
||||
// Activate the timeline (if it makes sense)
|
||||
if !(timeline.is_broken() || timeline.is_stopping()) {
|
||||
@@ -2493,22 +2494,14 @@ impl Tenant {
|
||||
timelines_to_compact_or_offload = timelines
|
||||
.iter()
|
||||
.filter_map(|(timeline_id, timeline)| {
|
||||
let (is_active, (can_offload, _)) =
|
||||
(timeline.is_active(), timeline.can_offload());
|
||||
let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload());
|
||||
let has_no_unoffloaded_children = {
|
||||
!timelines
|
||||
.iter()
|
||||
.any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id))
|
||||
};
|
||||
let config_allows_offload = self.conf.timeline_offloading
|
||||
|| self
|
||||
.tenant_conf
|
||||
.load()
|
||||
.tenant_conf
|
||||
.timeline_offloading
|
||||
.unwrap_or_default();
|
||||
let can_offload =
|
||||
can_offload && has_no_unoffloaded_children && config_allows_offload;
|
||||
can_offload && has_no_unoffloaded_children && self.conf.timeline_offloading;
|
||||
if (is_active, can_offload) == (false, false) {
|
||||
None
|
||||
} else {
|
||||
@@ -2537,11 +2530,6 @@ impl Tenant {
|
||||
.await
|
||||
.inspect_err(|e| match e {
|
||||
timeline::CompactionError::ShuttingDown => (),
|
||||
timeline::CompactionError::Offload(_) => {
|
||||
// Failures to offload timelines do not trip the circuit breaker, because
|
||||
// they do not do lots of writes the way compaction itself does: it is cheap
|
||||
// to retry, and it would be bad to stop all compaction because of an issue with offloading.
|
||||
}
|
||||
timeline::CompactionError::Other(e) => {
|
||||
self.compaction_circuit_breaker
|
||||
.lock()
|
||||
@@ -2557,7 +2545,8 @@ impl Tenant {
|
||||
if pending_task_left == Some(false) && *can_offload {
|
||||
offload_timeline(self, timeline)
|
||||
.instrument(info_span!("offload_timeline", %timeline_id))
|
||||
.await?;
|
||||
.await
|
||||
.map_err(timeline::CompactionError::Other)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3133,7 +3122,9 @@ impl Tenant {
|
||||
}
|
||||
}
|
||||
|
||||
let tenant_manifest = self.build_tenant_manifest();
|
||||
let tenant_manifest = self.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
for child_shard in child_shards {
|
||||
tracing::info!(
|
||||
"Uploading tenant manifest for child {}",
|
||||
@@ -3142,7 +3133,7 @@ impl Tenant {
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
child_shard,
|
||||
self.generation,
|
||||
generation,
|
||||
&tenant_manifest,
|
||||
&self.cancel,
|
||||
)
|
||||
@@ -3326,8 +3317,7 @@ impl Tenant {
|
||||
.unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
|
||||
}
|
||||
|
||||
/// Generate an up-to-date TenantManifest based on the state of this Tenant.
|
||||
fn build_tenant_manifest(&self) -> TenantManifest {
|
||||
pub(crate) fn tenant_manifest(&self) -> TenantManifest {
|
||||
let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
|
||||
|
||||
let mut timeline_manifests = timelines_offloaded
|
||||
@@ -3535,7 +3525,6 @@ impl Tenant {
|
||||
timelines: Mutex::new(HashMap::new()),
|
||||
timelines_creating: Mutex::new(HashSet::new()),
|
||||
timelines_offloaded: Mutex::new(HashMap::new()),
|
||||
tenant_manifest_upload: Default::default(),
|
||||
gc_cs: tokio::sync::Mutex::new(()),
|
||||
walredo_mgr,
|
||||
remote_storage,
|
||||
@@ -4715,49 +4704,6 @@ impl Tenant {
|
||||
.max()
|
||||
.unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Serialize and write the latest TenantManifest to remote storage.
|
||||
pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> {
|
||||
// Only one manifest write may be done at a time, and the contents of the manifest
|
||||
// must be loaded while holding this lock. This makes it safe to call this function
|
||||
// from anywhere without worrying about colliding updates.
|
||||
let mut guard = tokio::select! {
|
||||
g = self.tenant_manifest_upload.lock() => {
|
||||
g
|
||||
},
|
||||
_ = self.cancel.cancelled() => {
|
||||
return Err(TenantManifestError::Cancelled);
|
||||
}
|
||||
};
|
||||
|
||||
let manifest = self.build_tenant_manifest();
|
||||
if Some(&manifest) == (*guard).as_ref() {
|
||||
// Optimisation: skip uploads that don't change anything.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
upload_tenant_manifest(
|
||||
&self.remote_storage,
|
||||
&self.tenant_shard_id,
|
||||
self.generation,
|
||||
&manifest,
|
||||
&self.cancel,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
if self.cancel.is_cancelled() {
|
||||
TenantManifestError::Cancelled
|
||||
} else {
|
||||
TenantManifestError::RemoteStorage(e)
|
||||
}
|
||||
})?;
|
||||
|
||||
// Store the successfully uploaded manifest, so that future callers can avoid
|
||||
// re-uploading the same thing.
|
||||
*guard = Some(manifest);
|
||||
|
||||
Ok(())
|
||||
}
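
// Editorial sketch, not part of the diff: the intended call pattern. A code path that
// changes the set of offloaded timelines (offload, unoffload, deletion) mutates tenant
// state first and then persists the manifest; because the last uploaded manifest is
// cached in `tenant_manifest_upload`, repeated calls with unchanged state are cheap.
//
//     // ... after updating `timelines_offloaded` ...
//     self.store_tenant_manifest().await?; // uploads the new manifest
//     self.store_tenant_manifest().await?; // unchanged manifest: returns Ok(()) without an upload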
|
||||
}
|
||||
|
||||
/// Create the cluster temporarily in 'initdbpath' directory inside the repository
|
||||
@@ -4780,12 +4726,10 @@ async fn run_initdb(
|
||||
|
||||
let _permit = INIT_DB_SEMAPHORE.acquire().await;
|
||||
|
||||
let mut initdb_command = tokio::process::Command::new(&initdb_bin_path);
|
||||
initdb_command
|
||||
let initdb_command = tokio::process::Command::new(&initdb_bin_path)
|
||||
.args(["--pgdata", initdb_target_dir.as_ref()])
|
||||
.args(["--username", &conf.superuser])
|
||||
.args(["--encoding", "utf8"])
|
||||
.args(["--locale", &conf.locale])
|
||||
.arg("--no-instructions")
|
||||
.arg("--no-sync")
|
||||
.env_clear()
|
||||
@@ -4795,27 +4739,15 @@ async fn run_initdb(
|
||||
// stdout invocation produces the same output every time, we don't need it
|
||||
.stdout(std::process::Stdio::null())
|
||||
// we would be interested in the stderr output, if there was any
|
||||
.stderr(std::process::Stdio::piped());
|
||||
|
||||
// Before version 15, only the libc provider was available.
if pg_version > 14 {
    // Version 17 brought with it a builtin locale provider which only provides
    // C and C.UTF-8. While being safer for collation purposes since it is
    // guaranteed to be consistent throughout a major release, it is also more
    // performant.
    let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" };

    initdb_command.args(["--locale-provider", locale_provider]);
}
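
// Editorial sketch, not part of the diff: the flag selection above as a small pure
// function, with the version cutoffs spelled out. Only v15+ initdb understands
// --locale-provider, and v17+ additionally ships the "builtin" provider.
fn locale_provider_args(pg_version: u32) -> Vec<&'static str> {
    if pg_version > 14 {
        let provider = if pg_version >= 17 { "builtin" } else { "libc" };
        vec!["--locale-provider", provider]
    } else {
        vec![]
    }
}
// locale_provider_args(14) == []                                  (no provider flag)
// locale_provider_args(16) == ["--locale-provider", "libc"]
// locale_provider_args(17) == ["--locale-provider", "builtin"]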
|
||||
|
||||
let initdb_proc = initdb_command.spawn()?;
|
||||
.stderr(std::process::Stdio::piped())
|
||||
.spawn()?;
|
||||
|
||||
// Ideally we'd select here with the cancellation token, but the problem is that
|
||||
// we can't safely terminate initdb: it launches processes of its own, and killing
|
||||
// initdb doesn't kill them. After we return from this function, we want the target
|
||||
// directory to be able to be cleaned up.
|
||||
// See https://github.com/neondatabase/neon/issues/6385
|
||||
let initdb_output = initdb_proc.wait_with_output().await?;
|
||||
let initdb_output = initdb_command.wait_with_output().await?;
|
||||
if !initdb_output.status.success() {
|
||||
return Err(InitdbError::Failed(
|
||||
initdb_output.status,
|
||||
@@ -4874,8 +4806,7 @@ pub(crate) mod harness {
|
||||
use crate::deletion_queue::mock::MockDeletionQueue;
|
||||
use crate::l0_flush::L0FlushConfig;
|
||||
use crate::walredo::apply_neon;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use crate::{repository::Key, walrecord::NeonWalRecord};
|
||||
|
||||
use super::*;
|
||||
use hex_literal::hex;
|
||||
@@ -4922,9 +4853,9 @@ pub(crate) mod harness {
|
||||
image_layer_creation_check_threshold: Some(
|
||||
tenant_conf.image_layer_creation_check_threshold,
|
||||
),
|
||||
switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy),
|
||||
lsn_lease_length: Some(tenant_conf.lsn_lease_length),
|
||||
lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
|
||||
timeline_offloading: Some(tenant_conf.timeline_offloading),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -5145,30 +5076,24 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::keyspace::KeySpaceAccum;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::harness::*;
|
||||
use crate::tenant::timeline::CompactFlags;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use crate::DEFAULT_PG_VERSION;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use hex_literal::hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
|
||||
use pageserver_api::value::Value;
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
use rand::{thread_rng, Rng};
|
||||
use storage_layer::PersistentLayerKey;
|
||||
use tests::storage_layer::ValuesReconstructState;
|
||||
use tests::timeline::{GetVectoredError, ShutdownMode};
|
||||
use timeline::DeltaLayerTestDesc;
|
||||
use utils::id::TenantId;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
|
||||
#[cfg(feature = "testing")]
|
||||
use timeline::GcInfo;
|
||||
use timeline::{DeltaLayerTestDesc, GcInfo};
|
||||
use utils::id::TenantId;
|
||||
|
||||
static TEST_KEY: Lazy<Key> =
|
||||
Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
|
||||
@@ -7678,7 +7603,23 @@ mod tests {
|
||||
}
|
||||
|
||||
// Check if old layers are removed / new layers have the expected LSN
|
||||
let all_layers = inspect_and_sort(&tline, None).await;
|
||||
let mut all_layers = tline.inspect_historic_layers().await.unwrap();
|
||||
all_layers.sort_by(|k1, k2| {
|
||||
(
|
||||
k1.is_delta,
|
||||
k1.key_range.start,
|
||||
k1.key_range.end,
|
||||
k1.lsn_range.start,
|
||||
k1.lsn_range.end,
|
||||
)
|
||||
.cmp(&(
|
||||
k2.is_delta,
|
||||
k2.key_range.start,
|
||||
k2.key_range.end,
|
||||
k2.lsn_range.start,
|
||||
k2.lsn_range.end,
|
||||
))
|
||||
});
|
||||
assert_eq!(
|
||||
all_layers,
|
||||
vec![
|
||||
@@ -7718,7 +7659,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_neon_test_record() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_neon_test_record").await?;
|
||||
@@ -7910,7 +7850,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?;
|
||||
@@ -8107,7 +8046,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_generate_key_retention() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_generate_key_retention").await?;
|
||||
@@ -8455,7 +8393,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> {
|
||||
let harness =
|
||||
@@ -8696,7 +8633,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()>
|
||||
{
|
||||
@@ -8905,7 +8841,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?;
|
||||
@@ -9107,7 +9042,6 @@ mod tests {
|
||||
//
|
||||
// When querying the key range [A, B) we need to read at different LSN ranges
|
||||
// for [A, C) and [C, B). This test checks that the described edge case is handled correctly.
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?;
|
||||
@@ -9222,350 +9156,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
|
||||
(
|
||||
k1.is_delta,
|
||||
k1.key_range.start,
|
||||
k1.key_range.end,
|
||||
k1.lsn_range.start,
|
||||
k1.lsn_range.end,
|
||||
)
|
||||
.cmp(&(
|
||||
k2.is_delta,
|
||||
k2.key_range.start,
|
||||
k2.key_range.end,
|
||||
k2.lsn_range.start,
|
||||
k2.lsn_range.end,
|
||||
))
|
||||
}
|
||||
|
||||
async fn inspect_and_sort(
|
||||
tline: &Arc<Timeline>,
|
||||
filter: Option<std::ops::Range<Key>>,
|
||||
) -> Vec<PersistentLayerKey> {
|
||||
let mut all_layers = tline.inspect_historic_layers().await.unwrap();
|
||||
if let Some(filter) = filter {
|
||||
all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter));
|
||||
}
|
||||
all_layers.sort_by(sort_layer_key);
|
||||
all_layers
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn check_layer_map_key_eq(
|
||||
mut left: Vec<PersistentLayerKey>,
|
||||
mut right: Vec<PersistentLayerKey>,
|
||||
) {
|
||||
left.sort_by(sort_layer_key);
|
||||
right.sort_by(sort_layer_key);
|
||||
if left != right {
|
||||
eprintln!("---LEFT---");
|
||||
for left in left.iter() {
|
||||
eprintln!("{}", left);
|
||||
}
|
||||
eprintln!("---RIGHT---");
|
||||
for right in right.iter() {
|
||||
eprintln!("{}", right);
|
||||
}
|
||||
assert_eq!(left, right);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> {
|
||||
let harness = TenantHarness::create("test_simple_partial_bottom_most_compaction").await?;
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
|
||||
fn get_key(id: u32) -> Key {
|
||||
// using aux key here b/c they are guaranteed to be inside `collect_keyspace`.
|
||||
let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap();
|
||||
key.field6 = id;
|
||||
key
|
||||
}
|
||||
|
||||
// img layer at 0x10
|
||||
let img_layer = (0..10)
|
||||
.map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10"))))
|
||||
.collect_vec();
|
||||
|
||||
let delta1 = vec![
|
||||
(
|
||||
get_key(1),
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::from("value 1@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(2),
|
||||
Lsn(0x30),
|
||||
Value::Image(Bytes::from("value 2@0x30")),
|
||||
),
|
||||
(
|
||||
get_key(3),
|
||||
Lsn(0x40),
|
||||
Value::Image(Bytes::from("value 3@0x40")),
|
||||
),
|
||||
];
|
||||
let delta2 = vec![
|
||||
(
|
||||
get_key(5),
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::from("value 5@0x20")),
|
||||
),
|
||||
(
|
||||
get_key(6),
|
||||
Lsn(0x20),
|
||||
Value::Image(Bytes::from("value 6@0x20")),
|
||||
),
|
||||
];
|
||||
let delta3 = vec![
|
||||
(
|
||||
get_key(8),
|
||||
Lsn(0x48),
|
||||
Value::Image(Bytes::from("value 8@0x48")),
|
||||
),
|
||||
(
|
||||
get_key(9),
|
||||
Lsn(0x48),
|
||||
Value::Image(Bytes::from("value 9@0x48")),
|
||||
),
|
||||
];
|
||||
|
||||
let tline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TIMELINE_ID,
|
||||
Lsn(0x10),
|
||||
DEFAULT_PG_VERSION,
|
||||
&ctx,
|
||||
vec![
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
|
||||
DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3),
|
||||
], // delta layers
|
||||
vec![(Lsn(0x10), img_layer)], // image layers
|
||||
Lsn(0x50),
|
||||
)
|
||||
.await?;
|
||||
|
||||
{
|
||||
// Update GC info
|
||||
let mut guard = tline.gc_info.write().unwrap();
|
||||
*guard = GcInfo {
|
||||
retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)],
|
||||
cutoffs: GcCutoffs {
|
||||
time: Lsn(0x30),
|
||||
space: Lsn(0x30),
|
||||
},
|
||||
leases: Default::default(),
|
||||
within_ancestor_pitr: false,
|
||||
};
|
||||
}
|
||||
|
||||
let cancel = CancellationToken::new();
|
||||
|
||||
// Do a partial compaction on key range 0..2
|
||||
tline
|
||||
.partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
check_layer_map_key_eq(
|
||||
all_layers,
|
||||
vec![
|
||||
// newly-generated image layer for the partial compaction range 0-2
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(2),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(10),
|
||||
lsn_range: Lsn(0x10)..Lsn(0x11),
|
||||
is_delta: false,
|
||||
},
|
||||
// delta1 is split and the second part is rewritten
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(2)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(5)..get_key(7),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x48)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
],
|
||||
);
|
||||
|
||||
// Do a partial compaction on key range 2..4
|
||||
tline
|
||||
.partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
check_layer_map_key_eq(
|
||||
all_layers,
|
||||
vec![
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(2),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(10),
|
||||
lsn_range: Lsn(0x10)..Lsn(0x11),
|
||||
is_delta: false,
|
||||
},
|
||||
// image layer generated for the compaction range 2-4
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(2)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false,
|
||||
},
|
||||
// we have key2/key3 above the retain_lsn, so we still need this delta layer
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(2)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(5)..get_key(7),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x48)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
],
|
||||
);
|
||||
|
||||
// Do a partial compaction on key range 4..9
|
||||
tline
|
||||
.partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
check_layer_map_key_eq(
|
||||
all_layers,
|
||||
vec![
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(2),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(10),
|
||||
lsn_range: Lsn(0x10)..Lsn(0x11),
|
||||
is_delta: false,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(2)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(2)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true,
|
||||
},
|
||||
// image layer generated for this compaction range
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(4)..get_key(9),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x48)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
],
|
||||
);
|
||||
|
||||
// Do a partial compaction on key range 9..10
|
||||
tline
|
||||
.partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
check_layer_map_key_eq(
|
||||
all_layers,
|
||||
vec![
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(2),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(10),
|
||||
lsn_range: Lsn(0x10)..Lsn(0x11),
|
||||
is_delta: false,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(2)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(2)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(4)..get_key(9),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false,
|
||||
},
|
||||
// image layer generated for the compaction range
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(9)..get_key(10),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x48)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
],
|
||||
);
|
||||
|
||||
// Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones.
|
||||
tline
|
||||
.partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx)
|
||||
.await
|
||||
.unwrap();
|
||||
let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await;
|
||||
check_layer_map_key_eq(
|
||||
all_layers,
|
||||
vec![
|
||||
// aha, we removed all unnecessary image/delta layers and got a very clean layer map!
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(0)..get_key(10),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x21),
|
||||
is_delta: false,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(2)..get_key(4),
|
||||
lsn_range: Lsn(0x20)..Lsn(0x48),
|
||||
is_delta: true,
|
||||
},
|
||||
PersistentLayerKey {
|
||||
key_range: get_key(8)..get_key(10),
|
||||
lsn_range: Lsn(0x48)..Lsn(0x50),
|
||||
is_delta: true,
|
||||
},
|
||||
],
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
//! may lead to a data loss.
|
||||
//!
|
||||
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf;
|
||||
use pageserver_api::models::AuxFilePolicy;
|
||||
use pageserver_api::models::CompactionAlgorithmSettings;
|
||||
use pageserver_api::models::EvictionPolicy;
|
||||
use pageserver_api::models::{self, ThrottleConfig};
|
||||
@@ -340,6 +341,10 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_layer_creation_check_threshold: Option<u8>,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub switch_aux_file_policy: Option<AuxFilePolicy>,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(with = "humantime_serde")]
    #[serde(default)]
@@ -349,10 +354,6 @@ pub struct TenantConfOpt {
    #[serde(with = "humantime_serde")]
    #[serde(default)]
    pub lsn_lease_length_for_ts: Option<Duration>,

    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub timeline_offloading: Option<bool>,
}
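
// Editorial sketch, not part of the diff: how the Option fields act as per-tenant
// overrides. An unset field serializes to nothing (`skip_serializing_if`) and falls
// back to the global default in `merge()`; a set field wins. Illustrative helper only.
fn effective_timeline_offloading(tenant: &TenantConfOpt, global: &TenantConf) -> bool {
    tenant
        .timeline_offloading
        .unwrap_or(global.timeline_offloading)
}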
|
||||
|
||||
impl TenantConfOpt {
|
||||
@@ -409,15 +410,15 @@ impl TenantConfOpt {
|
||||
image_layer_creation_check_threshold: self
|
||||
.image_layer_creation_check_threshold
|
||||
.unwrap_or(global_conf.image_layer_creation_check_threshold),
|
||||
switch_aux_file_policy: self
|
||||
.switch_aux_file_policy
|
||||
.unwrap_or(global_conf.switch_aux_file_policy),
|
||||
lsn_lease_length: self
|
||||
.lsn_lease_length
|
||||
.unwrap_or(global_conf.lsn_lease_length),
|
||||
lsn_lease_length_for_ts: self
|
||||
.lsn_lease_length_for_ts
|
||||
.unwrap_or(global_conf.lsn_lease_length_for_ts),
|
||||
timeline_offloading: self
    .timeline_offloading
    .unwrap_or(global_conf.timeline_offloading),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -469,9 +470,9 @@ impl From<TenantConfOpt> for models::TenantConfig {
|
||||
lazy_slru_download: value.lazy_slru_download,
|
||||
timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
|
||||
image_layer_creation_check_threshold: value.image_layer_creation_check_threshold,
|
||||
switch_aux_file_policy: value.switch_aux_file_policy,
|
||||
lsn_lease_length: value.lsn_lease_length.map(humantime),
|
||||
lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime),
|
||||
timeline_offloading: value.timeline_offloading,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
use anyhow::Result;
|
||||
use serde::Serialize;
|
||||
use std::ops::AddAssign;
|
||||
use std::time::Duration;
|
||||
|
||||
///
|
||||
/// Result of performing GC
|
||||
///
|
||||
#[derive(Default, Serialize, Debug)]
|
||||
pub struct GcResult {
|
||||
pub layers_total: u64,
|
||||
pub layers_needed_by_cutoff: u64,
|
||||
pub layers_needed_by_pitr: u64,
|
||||
pub layers_needed_by_branches: u64,
|
||||
pub layers_needed_by_leases: u64,
|
||||
pub layers_not_updated: u64,
|
||||
pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files.
|
||||
|
||||
#[serde(serialize_with = "serialize_duration_as_millis")]
|
||||
pub elapsed: Duration,
|
||||
|
||||
/// The layers which were garbage collected.
|
||||
///
|
||||
/// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
|
||||
/// dropped in tests.
|
||||
#[cfg(feature = "testing")]
|
||||
#[serde(skip)]
|
||||
pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
|
||||
}
|
||||
|
||||
// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
|
||||
fn serialize_duration_as_millis<S>(d: &Duration, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
d.as_millis().serialize(serializer)
|
||||
}
|
||||
|
||||
impl AddAssign for GcResult {
|
||||
fn add_assign(&mut self, other: Self) {
|
||||
self.layers_total += other.layers_total;
|
||||
self.layers_needed_by_pitr += other.layers_needed_by_pitr;
|
||||
self.layers_needed_by_cutoff += other.layers_needed_by_cutoff;
|
||||
self.layers_needed_by_branches += other.layers_needed_by_branches;
|
||||
self.layers_needed_by_leases += other.layers_needed_by_leases;
|
||||
self.layers_not_updated += other.layers_not_updated;
|
||||
self.layers_removed += other.layers_removed;
|
||||
|
||||
self.elapsed += other.elapsed;
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
{
|
||||
let mut other = other;
|
||||
self.doomed_layers.append(&mut other.doomed_layers);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -48,9 +48,9 @@ mod layer_coverage;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use crate::keyspace::KeyPartitioning;
|
||||
use crate::repository::Key;
|
||||
use crate::tenant::storage_layer::InMemoryLayer;
|
||||
use anyhow::Result;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceAccum};
|
||||
use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze};
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
|
||||
@@ -1959,7 +1959,7 @@ impl TenantManager {
|
||||
attempt.before_reset_tenant();
|
||||
|
||||
let (_guard, progress) = utils::completion::channel();
|
||||
match tenant.shutdown(progress, ShutdownMode::Flush).await {
|
||||
match tenant.shutdown(progress, ShutdownMode::Hard).await {
|
||||
Ok(()) => {
|
||||
slot_guard.drop_old_value().expect("it was just shutdown");
|
||||
}
|
||||
@@ -2811,7 +2811,7 @@ where
|
||||
}
|
||||
|
||||
use {
|
||||
crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
|
||||
utils::http::error::ApiError,
|
||||
};
|
||||
|
||||
|
||||
@@ -190,7 +190,6 @@ use chrono::{NaiveDateTime, Utc};
|
||||
pub(crate) use download::download_initdb_tar_zst;
|
||||
use pageserver_api::models::TimelineArchivalState;
|
||||
use pageserver_api::shard::{ShardIndex, TenantShardId};
|
||||
use regex::Regex;
|
||||
use scopeguard::ScopeGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use utils::backoff::{
|
||||
@@ -200,7 +199,7 @@ use utils::pausable_failpoint;
|
||||
|
||||
use std::collections::{HashMap, VecDeque};
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
use std::sync::{Arc, Mutex, OnceLock};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Duration;
|
||||
|
||||
use remote_storage::{
|
||||
@@ -246,11 +245,11 @@ use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
|
||||
use super::Generation;
|
||||
|
||||
pub(crate) use download::{
|
||||
download_index_part, download_tenant_manifest, is_temp_download_file,
|
||||
do_download_tenant_manifest, download_index_part, is_temp_download_file,
|
||||
list_remote_tenant_shards, list_remote_timelines,
|
||||
};
|
||||
pub(crate) use index::LayerFileMetadata;
|
||||
pub(crate) use upload::upload_initdb_dir;
|
||||
pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest};
|
||||
|
||||
// Occasional network issues and such can cause remote operations to fail, and
|
||||
// that's expected. If a download fails, we log it at info-level, and retry.
|
||||
@@ -275,6 +274,12 @@ pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
|
||||
/// which we warn and skip.
|
||||
const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
|
||||
/// Hardcode a generation for the tenant manifest for now so that we don't
/// need to deal with generation-less manifests in the future.
///
/// TODO: add proper generation support to all the places that use this.
pub(crate) const TENANT_MANIFEST_GENERATION: Generation = Generation::new(1);
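
// Editorial sketch, not part of the diff: with the generation pinned to 1, each tenant
// shard currently reads and writes a single manifest object whose key carries the
// "-00000001" generation suffix (path layout per `remote_tenant_manifest_path`):
//
//     let path = remote_tenant_manifest_path(tenant_shard_id, TENANT_MANIFEST_GENERATION);
//     // => "tenants/<tenant_shard_id>/tenant-manifest-00000001.json"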
|
||||
|
||||
pub enum MaybeDeletedIndexPart {
|
||||
IndexPart(IndexPart),
|
||||
Deleted(IndexPart),
|
||||
@@ -1445,7 +1450,7 @@ impl RemoteTimelineClient {
|
||||
let remote_path = remote_layer_path(
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
&self.timeline_id,
|
||||
uploaded.metadata().shard,
|
||||
self.tenant_shard_id.to_index(),
|
||||
&uploaded.layer_desc().layer_name(),
|
||||
uploaded.metadata().generation,
|
||||
);
|
||||
@@ -1486,7 +1491,7 @@ impl RemoteTimelineClient {
|
||||
&adopted
|
||||
.get_timeline_id()
|
||||
.expect("Source timeline should be alive"),
|
||||
adopted.metadata().shard,
|
||||
self.tenant_shard_id.to_index(),
|
||||
&adopted.layer_desc().layer_name(),
|
||||
adopted.metadata().generation,
|
||||
);
|
||||
@@ -1494,7 +1499,7 @@ impl RemoteTimelineClient {
|
||||
let target_remote_path = remote_layer_path(
|
||||
&self.tenant_shard_id.tenant_id,
|
||||
&self.timeline_id,
|
||||
adopted_as.metadata().shard,
|
||||
self.tenant_shard_id.to_index(),
|
||||
&adopted_as.layer_desc().layer_name(),
|
||||
adopted_as.metadata().generation,
|
||||
);
|
||||
@@ -2201,18 +2206,6 @@ impl RemoteTimelineClient {
|
||||
inner.initialized_mut()?;
|
||||
Ok(UploadQueueAccessor { inner })
|
||||
}
|
||||
|
||||
pub(crate) fn no_pending_work(&self) -> bool {
|
||||
let inner = self.upload_queue.lock().unwrap();
|
||||
match &*inner {
|
||||
UploadQueue::Uninitialized
|
||||
| UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => true,
|
||||
UploadQueue::Stopped(UploadQueueStopped::Deletable(x)) => {
|
||||
x.upload_queue_for_deletion.no_pending_work()
|
||||
}
|
||||
UploadQueue::Initialized(x) => x.no_pending_work(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct UploadQueueAccessor<'a> {
|
||||
@@ -2246,12 +2239,6 @@ pub fn remote_tenant_manifest_path(
|
||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||
}
|
||||
|
||||
/// Prefix to all generations' manifest objects in a tenant shard
|
||||
pub fn remote_tenant_manifest_prefix(tenant_shard_id: &TenantShardId) -> RemotePath {
|
||||
let path = format!("tenants/{tenant_shard_id}/tenant-manifest",);
|
||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||
}
|
||||
|
||||
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
|
||||
let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
|
||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||
@@ -2346,15 +2333,6 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Given the key of a tenant manifest, parse out the generation number
|
||||
pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option<Generation> {
|
||||
static RE: OnceLock<Regex> = OnceLock::new();
|
||||
let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap());
|
||||
re.captures(path.get_path().as_str())
|
||||
.and_then(|c| c.get(1))
|
||||
.and_then(|m| Generation::parse_suffix(m.as_str()))
|
||||
}
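
// Illustrative sketch (not part of this diff): the naming scheme assumed by the
// parser above is "tenants/<tenant_shard_id>/tenant-manifest-<generation>.json",
// where the generation is an 8-digit lowercase hex suffix. The tenant shard id
// below is hypothetical, and a plain u32 stands in for Generation.
fn example_manifest_generation() -> Option<u32> {
    let key = "tenants/3aa8fcc61f6d357410b7de754b1d9001/tenant-manifest-00000002.json";
    let re = regex::Regex::new(r"tenant-manifest-([0-9a-f]{8})\.json$").unwrap();
    let hex = re.captures(key)?.get(1)?.as_str();
    u32::from_str_radix(hex, 16).ok() // Some(2)
}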

#[cfg(test)]
mod tests {
use super::*;

@@ -20,9 +20,7 @@ use utils::backoff;
|
||||
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::RequestContext;
|
||||
use crate::span::{
|
||||
debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id,
|
||||
};
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
|
||||
use crate::tenant::storage_layer::LayerName;
|
||||
use crate::tenant::Generation;
|
||||
@@ -38,10 +36,9 @@ use utils::pausable_failpoint;
|
||||
use super::index::{IndexPart, LayerFileMetadata};
|
||||
use super::manifest::TenantManifest;
|
||||
use super::{
|
||||
parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path,
|
||||
remote_initdb_archive_path, remote_initdb_preserved_archive_path, remote_tenant_manifest_path,
|
||||
remote_tenant_manifest_prefix, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
|
||||
FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
|
||||
parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
|
||||
remote_initdb_preserved_archive_path, remote_tenant_manifest_path, remote_tenant_path,
|
||||
FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
|
||||
};
|
||||
|
||||
///
|
||||
@@ -368,34 +365,32 @@ async fn do_download_remote_path_retry_forever(
|
||||
.await
|
||||
}
|
||||
|
||||
async fn do_download_tenant_manifest(
|
||||
pub async fn do_download_tenant_manifest(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
_timeline_id: Option<&TimelineId>,
|
||||
generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> {
|
||||
) -> Result<(TenantManifest, Generation), DownloadError> {
|
||||
// TODO: generation support
|
||||
let generation = super::TENANT_MANIFEST_GENERATION;
|
||||
let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
|
||||
|
||||
let (manifest_bytes, manifest_bytes_mtime) =
|
||||
let (manifest_bytes, _manifest_bytes_mtime) =
|
||||
do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?;
|
||||
|
||||
let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes)
|
||||
.with_context(|| format!("deserialize tenant manifest file at {remote_path:?}"))
|
||||
.map_err(DownloadError::Other)?;
|
||||
|
||||
Ok((tenant_manifest, generation, manifest_bytes_mtime))
|
||||
Ok((tenant_manifest, generation))
|
||||
}
|
||||
|
||||
async fn do_download_index_part(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: Option<&TimelineId>,
|
||||
timeline_id: &TimelineId,
|
||||
index_generation: Generation,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
|
||||
let timeline_id =
|
||||
timeline_id.expect("A timeline ID is always provided when downloading an index");
|
||||
let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
|
||||
|
||||
let (index_part_bytes, index_part_mtime) =
|
||||
@@ -408,79 +403,59 @@ async fn do_download_index_part(
|
||||
Ok((index_part, index_generation, index_part_mtime))
|
||||
}
|
||||
|
||||
/// Metadata objects are "generationed", meaning that they include a generation suffix. This
|
||||
/// function downloads the object with the highest generation <= `my_generation`.
|
||||
/// index_part.json objects are suffixed with a generation number, so we cannot
|
||||
/// directly GET the latest index part without doing some probing.
|
||||
///
|
||||
/// Data objects (layer files) also include a generation in their path, but there is no equivalent
|
||||
/// search process, because their reference from an index includes the generation.
|
||||
///
|
||||
/// An expensive object listing operation is only done if necessary: the typical fast path is to issue two
|
||||
/// GET operations, one to our own generation (stale attachment case), and one to the immediately preceding
|
||||
/// generation (normal case when migrating/restarting). Only if both of these return 404 do we fall back
|
||||
/// to listing objects.
|
||||
///
|
||||
/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]`
|
||||
/// * `what`: for logging, what object are we downloading
|
||||
/// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation)
|
||||
/// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless
|
||||
/// `cancel` has fired. This function does not do its own retries of GET operations, and relies
|
||||
/// on the function passed in to do so.
|
||||
/// * `parse_path`: parse a fully qualified remote storage path to get the generation of the object.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
/// In this function we probe for the most recent index in a generation <= our current generation.
|
||||
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
#[tracing::instrument(skip_all, fields(generation=?my_generation))]
|
||||
pub(crate) async fn download_generation_object<'a, T, DF, DFF, PF>(
|
||||
storage: &'a GenericRemoteStorage,
|
||||
tenant_shard_id: &'a TenantShardId,
|
||||
timeline_id: Option<&'a TimelineId>,
|
||||
pub(crate) async fn download_index_part(
|
||||
storage: &GenericRemoteStorage,
|
||||
tenant_shard_id: &TenantShardId,
|
||||
timeline_id: &TimelineId,
|
||||
my_generation: Generation,
|
||||
what: &str,
|
||||
prefix: RemotePath,
|
||||
do_download: DF,
|
||||
parse_path: PF,
|
||||
cancel: &'a CancellationToken,
|
||||
) -> Result<(T, Generation, SystemTime), DownloadError>
|
||||
where
|
||||
DF: Fn(
|
||||
&'a GenericRemoteStorage,
|
||||
&'a TenantShardId,
|
||||
Option<&'a TimelineId>,
|
||||
Generation,
|
||||
&'a CancellationToken,
|
||||
) -> DFF,
|
||||
DFF: Future<Output = Result<(T, Generation, SystemTime), DownloadError>>,
|
||||
PF: Fn(RemotePath) -> Option<Generation>,
|
||||
T: 'static,
|
||||
{
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
if my_generation.is_none() {
|
||||
// Operating without generations: just fetch the generation-less path
|
||||
return do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
|
||||
return do_download_index_part(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
my_generation,
|
||||
cancel,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
|
||||
// Stale case: If we were intentionally attached in a stale generation, the remote object may already
|
||||
// exist in our generation.
|
||||
// Stale case: If we were intentionally attached in a stale generation, there may already be a remote
|
||||
// index in our generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res = do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
|
||||
let res =
|
||||
do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await;
|
||||
match res {
|
||||
Ok(decoded) => {
|
||||
tracing::debug!("Found {what} from current generation (this is a stale attachment)");
|
||||
return Ok(decoded);
|
||||
Ok(index_part) => {
|
||||
tracing::debug!(
|
||||
"Found index_part from current generation (this is a stale attachment)"
|
||||
);
|
||||
return Ok(index_part);
|
||||
}
|
||||
Err(DownloadError::NotFound) => {}
|
||||
Err(e) => return Err(e),
|
||||
};
|
||||
|
||||
// Typical case: the previous generation of this tenant was running healthily, and had uploaded the object
|
||||
// we are seeking in that generation. We may safely start from this index without doing a listing, because:
|
||||
// Typical case: the previous generation of this tenant was running healthily, and had uploaded
|
||||
// an index part. We may safely start from this index without doing a listing, because:
|
||||
// - We checked for current generation case above
|
||||
// - generations > my_generation are to be ignored
|
||||
// - any other objects that exist would have an older generation than `previous_gen`, and
|
||||
// we want to find the most recent object from a previous generation.
|
||||
// - any other indices that exist would have an older generation than `previous_gen`, and
|
||||
// we want to find the most recent index from a previous generation.
|
||||
//
|
||||
// This is an optimization to avoid doing the listing for the general case below.
|
||||
let res = do_download(
|
||||
let res = do_download_index_part(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
@@ -489,12 +464,14 @@ where
|
||||
)
|
||||
.await;
|
||||
match res {
|
||||
Ok(decoded) => {
|
||||
tracing::debug!("Found {what} from previous generation");
|
||||
return Ok(decoded);
|
||||
Ok(index_part) => {
|
||||
tracing::debug!("Found index_part from previous generation");
|
||||
return Ok(index_part);
|
||||
}
|
||||
Err(DownloadError::NotFound) => {
|
||||
tracing::debug!("No {what} found from previous generation, falling back to listing");
|
||||
tracing::debug!(
|
||||
"No index_part found from previous generation, falling back to listing"
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
return Err(e);
|
||||
@@ -504,10 +481,12 @@ where
|
||||
// General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
|
||||
// objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent
|
||||
// to constructing a full index path with no generation, because the generation is a suffix.
|
||||
let paths = download_retry(
|
||||
let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
|
||||
|
||||
let indices = download_retry(
|
||||
|| async {
|
||||
storage
|
||||
.list(Some(&prefix), ListingMode::NoDelimiter, None, cancel)
|
||||
.list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
|
||||
.await
|
||||
},
|
||||
"list index_part files",
|
||||
@@ -518,22 +497,22 @@ where
|
||||
|
||||
// General case logic for which index to use: the latest index whose generation
|
||||
// is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
|
||||
let max_previous_generation = paths
|
||||
let max_previous_generation = indices
|
||||
.into_iter()
|
||||
.filter_map(|o| parse_path(o.key))
|
||||
.filter_map(|o| parse_remote_index_path(o.key))
|
||||
.filter(|g| g <= &my_generation)
|
||||
.max();
|
||||
|
||||
match max_previous_generation {
|
||||
Some(g) => {
|
||||
tracing::debug!("Found {what} in generation {g:?}");
|
||||
do_download(storage, tenant_shard_id, timeline_id, g, cancel).await
|
||||
tracing::debug!("Found index_part in generation {g:?}");
|
||||
do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await
|
||||
}
|
||||
None => {
|
||||
// Migration from legacy pre-generation state: we have a generation but no prior
|
||||
// attached pageservers did. Try to load from a no-generation path.
|
||||
tracing::debug!("No {what}* found");
|
||||
do_download(
|
||||
tracing::debug!("No index_part.json* found");
|
||||
do_download_index_part(
|
||||
storage,
|
||||
tenant_shard_id,
|
||||
timeline_id,
|
||||
@@ -545,57 +524,6 @@ where
|
||||
}
|
||||
}

/// index_part.json objects are suffixed with a generation number, so we cannot
/// directly GET the latest index part without doing some probing.
///
/// In this function we probe for the most recent index in a generation <= our current generation.
/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
pub(crate) async fn download_index_part(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
my_generation: Generation,
cancel: &CancellationToken,
) -> Result<(IndexPart, Generation, SystemTime), DownloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();

let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
download_generation_object(
storage,
tenant_shard_id,
Some(timeline_id),
my_generation,
"index_part",
index_prefix,
do_download_index_part,
parse_remote_index_path,
cancel,
)
.await
}

pub(crate) async fn download_tenant_manifest(
storage: &GenericRemoteStorage,
tenant_shard_id: &TenantShardId,
my_generation: Generation,
cancel: &CancellationToken,
) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> {
let manifest_prefix = remote_tenant_manifest_prefix(tenant_shard_id);

download_generation_object(
storage,
tenant_shard_id,
None,
my_generation,
"tenant-manifest",
manifest_prefix,
do_download_tenant_manifest,
parse_remote_tenant_manifest_path,
cancel,
)
.await
}
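
// Minimal sketch of the probing order implemented by the helpers above (toy u32
// generations and closures instead of the real remote storage client): try our
// own generation first (stale attachment), then the immediately preceding one
// (normal restart/migration), and only then fall back to a listing, taking the
// newest generation that is <= ours.
fn probe_latest_generation<F, L>(my_generation: u32, fetch: F, list: L) -> Option<u32>
where
    F: Fn(u32) -> bool, // stands in for a GET of the object at a given generation
    L: Fn() -> Vec<u32>, // stands in for listing all generations under the prefix
{
    if fetch(my_generation) {
        return Some(my_generation);
    }
    if my_generation > 0 && fetch(my_generation - 1) {
        return Some(my_generation - 1);
    }
    list()
        .into_iter()
        .filter(|g| *g <= my_generation)
        .max()
        .filter(|g| fetch(*g))
}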

pub(crate) async fn download_initdb_tar_zst(
conf: &'static PageServerConf,
storage: &GenericRemoteStorage,

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
/// Tenant-shard scoped manifest
|
||||
#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
pub struct TenantManifest {
|
||||
/// Debugging aid describing the version of this manifest.
|
||||
/// Can also be used for distinguishing breaking changes later on.
|
||||
@@ -23,7 +23,7 @@ pub struct TenantManifest {
|
||||
/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
|
||||
/// but the two datastructures serve different needs, this is for a persistent disk format
|
||||
/// that must be backwards compatible, while the other is only for informative purposes.
|
||||
#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)]
|
||||
#[derive(Clone, Serialize, Deserialize, Copy)]
|
||||
pub struct OffloadedTimelineManifest {
|
||||
pub timeline_id: TimelineId,
|
||||
/// Whether the timeline has a parent it has been branched off from or not
|
||||
|
||||
@@ -11,11 +11,11 @@ mod layer_name;
|
||||
pub mod merge_iterator;
|
||||
|
||||
use crate::context::{AccessStatsBehavior, RequestContext};
|
||||
use crate::repository::Value;
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE};
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
@@ -196,9 +196,6 @@ impl ValuesReconstructState {
|
||||
/// Returns true if this was the last value needed for the key and false otherwise.
|
||||
///
|
||||
/// If the key is done after the update, mark it as such.
|
||||
///
|
||||
/// If the key is in the sparse keyspace (i.e., aux files), we do not track them in
|
||||
/// `key_done`.
|
||||
pub(crate) fn update_key(
|
||||
&mut self,
|
||||
key: &Key,
|
||||
@@ -209,18 +206,10 @@ impl ValuesReconstructState {
|
||||
.keys
|
||||
.entry(*key)
|
||||
.or_insert(Ok(VectoredValueReconstructState::default()));
|
||||
let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key);
|
||||
|
||||
if let Ok(state) = state {
|
||||
let key_done = match state.situation {
|
||||
ValueReconstructSituation::Complete => {
|
||||
if is_sparse_key {
|
||||
// Sparse keyspace might be visited multiple times because
|
||||
// we don't track unmapped keyspaces.
|
||||
return ValueReconstructSituation::Complete;
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}
|
||||
ValueReconstructSituation::Complete => unreachable!(),
|
||||
ValueReconstructSituation::Continue => match value {
|
||||
Value::Image(img) => {
|
||||
state.img = Some((lsn, img));
|
||||
@@ -245,9 +234,7 @@ impl ValuesReconstructState {
|
||||
|
||||
if key_done && state.situation == ValueReconstructSituation::Continue {
|
||||
state.situation = ValueReconstructSituation::Complete;
|
||||
if !is_sparse_key {
|
||||
self.keys_done.add_key(*key);
|
||||
}
|
||||
self.keys_done.add_key(*key);
|
||||
}
|
||||
|
||||
state.situation
|
||||
|
||||
@@ -5,8 +5,7 @@ use pageserver_api::key::{Key, KEY_SIZE};
|
||||
use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId};
|
||||
|
||||
use crate::tenant::storage_layer::Layer;
|
||||
use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline};
|
||||
use pageserver_api::value::Value;
|
||||
use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline};
|
||||
|
||||
use super::layer::S3_UPLOAD_LIMIT;
|
||||
use super::{
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{
|
||||
@@ -45,7 +46,7 @@ use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
|
||||
use crate::virtual_file::IoBufferMut;
|
||||
use crate::virtual_file::{self, MaybeFatalIo, VirtualFile};
|
||||
use crate::TEMP_FILE_SUFFIX;
|
||||
use crate::{walrecord, TEMP_FILE_SUFFIX};
|
||||
use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
|
||||
use anyhow::{anyhow, bail, ensure, Context, Result};
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
@@ -53,11 +54,9 @@ use futures::StreamExt;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::config::MaxVectoredReadBytes;
|
||||
use pageserver_api::key::DBDIR_KEY;
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::ImageCompressionAlgorithm;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::value::Value;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
@@ -270,7 +269,7 @@ impl AsLayerDesc for DeltaLayer {
|
||||
}
|
||||
|
||||
impl DeltaLayer {
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
self.desc.dump();
|
||||
|
||||
if !verbose {
|
||||
@@ -653,10 +652,6 @@ impl DeltaLayerWriter {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.inner.as_ref().unwrap().num_keys == 0
|
||||
}
|
||||
|
||||
///
|
||||
/// Append a key-value pair to the file.
|
||||
///
|
||||
@@ -1298,7 +1293,7 @@ impl DeltaLayerInner {
|
||||
// is it an image or will_init walrecord?
|
||||
// FIXME: this could be handled by threading the BlobRef to the
|
||||
// VectoredReadBuilder
|
||||
let will_init = pageserver_api::value::ValueBytes::will_init(&data)
|
||||
let will_init = crate::repository::ValueBytes::will_init(&data)
|
||||
.inspect_err(|_e| {
|
||||
#[cfg(feature = "testing")]
|
||||
tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value");
|
||||
@@ -1361,7 +1356,7 @@ impl DeltaLayerInner {
|
||||
format!(" img {} bytes", img.len())
|
||||
}
|
||||
Value::WalRecord(rec) => {
|
||||
let wal_desc = pageserver_api::record::describe_wal_record(&rec)?;
|
||||
let wal_desc = walrecord::describe_wal_record(&rec)?;
|
||||
format!(
|
||||
" rec {} bytes will_init: {} {}",
|
||||
buf.len(),
|
||||
@@ -1442,7 +1437,7 @@ impl DeltaLayerInner {
|
||||
offset
|
||||
}
|
||||
|
||||
pub fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
|
||||
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
|
||||
let block_reader = FileBlockReader::new(&self.file, self.file_id);
|
||||
let tree_reader =
|
||||
DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
|
||||
@@ -1615,6 +1610,7 @@ pub(crate) mod test {
|
||||
use rand::RngCore;
|
||||
|
||||
use super::*;
|
||||
use crate::repository::Value;
|
||||
use crate::tenant::harness::TIMELINE_ID;
|
||||
use crate::tenant::storage_layer::{Layer, ResidentLayer};
|
||||
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
|
||||
@@ -1626,7 +1622,6 @@ pub(crate) mod test {
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
/// Construct an index for a fictional delta layer and then
|
||||
/// traverse in order to plan vectored reads for a query. Finally,
|
||||
@@ -1979,8 +1974,8 @@ pub(crate) mod test {
|
||||
|
||||
#[tokio::test]
|
||||
async fn copy_delta_prefix_smoke() {
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
|
||||
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
|
||||
.await
|
||||
@@ -2203,7 +2198,6 @@ pub(crate) mod test {
|
||||
(k1, l1).cmp(&(k2, l2))
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
pub(crate) fn sort_delta_value(
|
||||
(k1, l1, v1): &(Key, Lsn, Value),
|
||||
(k2, l2, v2): &(Key, Lsn, Value),
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::{ops::Range, sync::Arc};
|
||||
use std::ops::Range;
|
||||
|
||||
use anyhow::bail;
|
||||
use pageserver_api::{
|
||||
@@ -7,12 +7,9 @@ use pageserver_api::{
|
||||
};
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use pageserver_api::value::Value;
|
||||
use crate::repository::Value;
|
||||
|
||||
use super::{
|
||||
merge_iterator::{MergeIterator, MergeIteratorItem},
|
||||
PersistentLayerKey,
|
||||
};
|
||||
use super::merge_iterator::MergeIterator;
|
||||
|
||||
/// A filter iterator over merge iterators (and can be easily extended to other types of iterators).
|
||||
///
|
||||
@@ -51,10 +48,10 @@ impl<'a> FilterIterator<'a> {
|
||||
})
|
||||
}
|
||||
|
||||
async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
|
||||
while let Some(item) = self.inner.next_inner::<R>().await? {
|
||||
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
while let Some(item) = self.inner.next().await? {
|
||||
while self.current_filter_idx < self.retain_key_filters.len()
|
||||
&& item.key_lsn_value().0 >= self.retain_key_filters[self.current_filter_idx].end
|
||||
&& item.0 >= self.retain_key_filters[self.current_filter_idx].end
|
||||
{
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
@@ -71,7 +68,7 @@ impl<'a> FilterIterator<'a> {
|
||||
// ^ current filter (nothing)
|
||||
return Ok(None);
|
||||
}
|
||||
if self.retain_key_filters[self.current_filter_idx].contains(&item.key_lsn_value().0) {
|
||||
if self.retain_key_filters[self.current_filter_idx].contains(&item.0) {
|
||||
// [filter region] [filter region] [filter region]
|
||||
// ^ item
|
||||
// ^ current filter
|
||||
@@ -84,16 +81,6 @@ impl<'a> FilterIterator<'a> {
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
self.next_inner().await
|
||||
}
|
||||
|
||||
pub async fn next_with_trace(
|
||||
&mut self,
|
||||
) -> anyhow::Result<Option<((Key, Lsn, Value), Arc<PersistentLayerKey>)>> {
|
||||
self.next_inner().await
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -134,8 +121,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn filter_keyspace_iterator() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator")
|
||||
.await
|
||||
|
||||
@@ -28,6 +28,7 @@
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::page_cache::{self, FileId, PAGE_SZ};
|
||||
use crate::repository::{Key, Value, KEY_SIZE};
|
||||
use crate::tenant::blob_io::BlobWriter;
|
||||
use crate::tenant::block_io::{BlockBuf, FileBlockReader};
|
||||
use crate::tenant::disk_btree::{
|
||||
@@ -50,10 +51,8 @@ use hex;
|
||||
use itertools::Itertools;
|
||||
use pageserver_api::config::MaxVectoredReadBytes;
|
||||
use pageserver_api::key::DBDIR_KEY;
|
||||
use pageserver_api::key::{Key, KEY_SIZE};
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::shard::{ShardIdentity, TenantShardId};
|
||||
use pageserver_api::value::Value;
|
||||
use rand::{distributions::Alphanumeric, Rng};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::VecDeque;
|
||||
@@ -231,7 +230,7 @@ impl AsLayerDesc for ImageLayer {
|
||||
}
|
||||
|
||||
impl ImageLayer {
|
||||
pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
|
||||
self.desc.dump();
|
||||
|
||||
if !verbose {
|
||||
@@ -1126,7 +1125,6 @@ mod test {
|
||||
use pageserver_api::{
|
||||
key::Key,
|
||||
shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize},
|
||||
value::Value,
|
||||
};
|
||||
use utils::{
|
||||
generation::Generation,
|
||||
@@ -1136,6 +1134,7 @@ mod test {
|
||||
|
||||
use crate::{
|
||||
context::RequestContext,
|
||||
repository::Value,
|
||||
tenant::{
|
||||
config::TenantConf,
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
|
||||
@@ -7,25 +7,23 @@
|
||||
use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64};
|
||||
use crate::config::PageServerConf;
|
||||
use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::tenant::ephemeral_file::EphemeralFile;
|
||||
use crate::tenant::timeline::GetVectoredError;
|
||||
use crate::tenant::PageReconstructError;
|
||||
use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
|
||||
use crate::{l0_flush, page_cache};
|
||||
use anyhow::{anyhow, Result};
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use camino::Utf8PathBuf;
|
||||
use pageserver_api::key::CompactKey;
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::models::InMemoryLayerInfo;
|
||||
use pageserver_api::shard::TenantShardId;
|
||||
use pageserver_api::value::Value;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
|
||||
use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta};
|
||||
// avoid binding to Write (conflicts with std::io::Write)
|
||||
// while being able to use std::fmt::Write's methods
|
||||
use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
|
||||
@@ -67,8 +65,6 @@ pub struct InMemoryLayer {
|
||||
/// The above fields never change, except for `end_lsn`, which is only set once.
|
||||
/// All other changing parts are in `inner`, and protected by a mutex.
|
||||
inner: RwLock<InMemoryLayerInner>,
|
||||
|
||||
estimated_in_mem_size: AtomicU64,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for InMemoryLayer {
|
||||
@@ -455,7 +451,6 @@ impl InMemoryLayer {
|
||||
len,
|
||||
will_init,
|
||||
} = index_entry.unpack();
|
||||
|
||||
reads.entry(key).or_default().push(ValueRead {
|
||||
entry_lsn: *entry_lsn,
|
||||
read: vectored_dio_read::LogicalRead::new(
|
||||
@@ -517,6 +512,68 @@ impl InMemoryLayer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Offset of a particular Value within a serialized batch.
|
||||
struct SerializedBatchOffset {
|
||||
key: CompactKey,
|
||||
lsn: Lsn,
|
||||
// TODO: separate type when we start serde-serializing this value, to avoid coupling
|
||||
// in-memory representation to serialization format.
|
||||
index_entry: IndexEntry,
|
||||
}
|
||||
|
||||
pub struct SerializedBatch {
|
||||
/// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`].
|
||||
pub(crate) raw: Vec<u8>,
|
||||
|
||||
/// Index of values in [`Self::raw`], using offsets relative to the start of the buffer.
|
||||
offsets: Vec<SerializedBatchOffset>,
|
||||
|
||||
/// The highest LSN of any value in the batch
|
||||
pub(crate) max_lsn: Lsn,
|
||||
}
|
||||
|
||||
impl SerializedBatch {
|
||||
pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result<Self> {
|
||||
// Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by
|
||||
// [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`]
|
||||
let buffer_size = batch.iter().map(|i| i.2).sum::<usize>();
|
||||
let mut cursor = std::io::Cursor::new(Vec::<u8>::with_capacity(buffer_size));
|
||||
|
||||
let mut offsets: Vec<SerializedBatchOffset> = Vec::with_capacity(batch.len());
|
||||
let mut max_lsn: Lsn = Lsn(0);
|
||||
for (key, lsn, val_ser_size, val) in batch {
|
||||
let relative_off = cursor.position();
|
||||
|
||||
val.ser_into(&mut cursor)
|
||||
.expect("Writing into in-memory buffer is infallible");
|
||||
|
||||
offsets.push(SerializedBatchOffset {
|
||||
key,
|
||||
lsn,
|
||||
index_entry: IndexEntry::new(IndexEntryNewArgs {
|
||||
base_offset: 0,
|
||||
batch_offset: relative_off,
|
||||
len: val_ser_size,
|
||||
will_init: val.will_init(),
|
||||
})
|
||||
.context("higher-level code ensures that values are within supported ranges")?,
|
||||
});
|
||||
max_lsn = std::cmp::max(max_lsn, lsn);
|
||||
}
|
||||
|
||||
let buffer = cursor.into_inner();
|
||||
|
||||
// Assert that we didn't do any extra allocations while building buffer.
|
||||
debug_assert!(buffer.len() <= buffer_size);
|
||||
|
||||
Ok(Self {
|
||||
raw: buffer,
|
||||
offsets,
|
||||
max_lsn,
|
||||
})
|
||||
}
|
||||
}
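
// Toy illustration of the scheme used by SerializedBatch above (plain byte
// slices instead of serialized Values): pack everything into one flat buffer,
// remember offsets relative to the batch start, and rebase them against the
// ephemeral file's current length when the batch is appended in put_batch.
fn build_batch(values: &[&[u8]]) -> (Vec<u8>, Vec<(u64, usize)>) {
    let mut raw = Vec::new();
    let mut offsets = Vec::with_capacity(values.len());
    for v in values {
        offsets.push((raw.len() as u64, v.len())); // offset relative to batch start
        raw.extend_from_slice(v);
    }
    (raw, offsets)
}

fn rebase_offsets(offsets: &mut [(u64, usize)], base_offset: u64) {
    for (off, _len) in offsets.iter_mut() {
        *off += base_offset; // now absolute within the file the batch was appended to
    }
}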
|
||||
|
||||
fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
|
||||
write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
|
||||
}
|
||||
@@ -545,10 +602,6 @@ impl InMemoryLayer {
|
||||
Ok(inner.file.len())
|
||||
}
|
||||
|
||||
pub fn estimated_in_mem_size(&self) -> u64 {
|
||||
self.estimated_in_mem_size.load(AtomicOrdering::Relaxed)
|
||||
}
|
||||
|
||||
/// Create a new, empty, in-memory layer
|
||||
pub async fn create(
|
||||
conf: &'static PageServerConf,
|
||||
@@ -578,7 +631,6 @@ impl InMemoryLayer {
|
||||
file,
|
||||
resource_units: GlobalResourceUnits::new(),
|
||||
}),
|
||||
estimated_in_mem_size: AtomicU64::new(0),
|
||||
})
|
||||
}
|
||||
|
||||
@@ -589,7 +641,7 @@ impl InMemoryLayer {
|
||||
/// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors.
|
||||
pub async fn put_batch(
|
||||
&self,
|
||||
serialized_batch: SerializedValueBatch,
|
||||
serialized_batch: SerializedBatch,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
let mut inner = self.inner.write().await;
|
||||
@@ -597,13 +649,27 @@ impl InMemoryLayer {
|
||||
|
||||
let base_offset = inner.file.len();
|
||||
|
||||
let SerializedValueBatch {
|
||||
let SerializedBatch {
|
||||
raw,
|
||||
metadata,
|
||||
mut offsets,
|
||||
max_lsn: _,
|
||||
len: _,
|
||||
} = serialized_batch;
|
||||
|
||||
// Add the base_offset to the batch's index entries which are relative to the batch start.
|
||||
for offset in &mut offsets {
|
||||
let IndexEntryUnpacked {
|
||||
will_init,
|
||||
len,
|
||||
pos,
|
||||
} = offset.index_entry.unpack();
|
||||
offset.index_entry = IndexEntry::new(IndexEntryNewArgs {
|
||||
base_offset,
|
||||
batch_offset: pos,
|
||||
len: len.into_usize(),
|
||||
will_init,
|
||||
})?;
|
||||
}
|
||||
|
||||
// Write the batch to the file
|
||||
inner.file.write_raw(&raw, ctx).await?;
|
||||
let new_size = inner.file.len();
|
||||
@@ -616,28 +682,12 @@ impl InMemoryLayer {
|
||||
assert_eq!(new_size, expected_new_len);
|
||||
|
||||
// Update the index with the new entries
|
||||
for meta in metadata {
|
||||
let SerializedValueMeta {
|
||||
key,
|
||||
lsn,
|
||||
batch_offset,
|
||||
len,
|
||||
will_init,
|
||||
} = match meta {
|
||||
ValueMeta::Serialized(ser) => ser,
|
||||
ValueMeta::Observed(_) => {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// Add the base_offset to the batch's index entries which are relative to the batch start.
|
||||
let index_entry = IndexEntry::new(IndexEntryNewArgs {
|
||||
base_offset,
|
||||
batch_offset,
|
||||
len,
|
||||
will_init,
|
||||
})?;
|
||||
|
||||
for SerializedBatchOffset {
|
||||
key,
|
||||
lsn,
|
||||
index_entry,
|
||||
} in offsets
|
||||
{
|
||||
let vec_map = inner.index.entry(key).or_default();
|
||||
let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0;
|
||||
if old.is_some() {
|
||||
@@ -649,12 +699,6 @@ impl InMemoryLayer {
|
||||
// because this case is unexpected, and we would like tests to fail if this happens.
|
||||
warn!("Key {} at {} written twice at same LSN", key, lsn);
|
||||
}
|
||||
self.estimated_in_mem_size.fetch_add(
|
||||
(std::mem::size_of::<CompactKey>()
|
||||
+ std::mem::size_of::<Lsn>()
|
||||
+ std::mem::size_of::<IndexEntry>()) as u64,
|
||||
AtomicOrdering::Relaxed,
|
||||
);
|
||||
}
|
||||
|
||||
inner.resource_units.maybe_publish_size(new_size);
|
||||
|
||||
@@ -760,8 +760,8 @@ async fn evict_and_wait_does_not_wait_for_download() {
|
||||
/// Also checks that the same does not happen on a non-evicted layer (regression test).
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn eviction_cancellation_on_drop() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
// this is the runtime on which Layer spawns the blocking tasks on
|
||||
let handle = tokio::runtime::Handle::current();
|
||||
@@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() {
|
||||
let mut writer = timeline.writer().await;
|
||||
writer
|
||||
.put(
|
||||
pageserver_api::key::Key::from_i128(5),
|
||||
crate::repository::Key::from_i128(5),
|
||||
Lsn(0x20),
|
||||
&Value::Image(Bytes::from_static(b"this does not matter either")),
|
||||
&ctx,
|
||||
|
||||
@@ -3,7 +3,7 @@ use pageserver_api::shard::TenantShardId;
|
||||
use std::ops::Range;
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
use crate::repository::Key;
|
||||
|
||||
use super::{DeltaLayerName, ImageLayerName, LayerName};
|
||||
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
//!
|
||||
//! Helper functions for dealing with filenames of the image and delta layer files.
|
||||
//!
|
||||
use pageserver_api::key::Key;
|
||||
use crate::repository::Key;
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::Ordering;
|
||||
use std::fmt;
|
||||
use std::ops::Range;
|
||||
use std::str::FromStr;
|
||||
|
||||
use regex::Regex;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use super::PersistentLayerDesc;
|
||||
@@ -58,31 +60,32 @@ impl Ord for DeltaLayerName {
|
||||
/// Represents the region of the LSN-Key space covered by a DeltaLayer
|
||||
///
|
||||
/// ```text
|
||||
/// <key start>-<key end>__<LSN start>-<LSN end>-<generation>
|
||||
/// <key start>-<key end>__<LSN start>-<LSN end>
|
||||
/// ```
|
||||
impl DeltaLayerName {
|
||||
/// Parse the part of a delta layer's file name that represents the LayerName. Returns None
|
||||
/// if the filename does not match the expected pattern.
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let (key_parts, lsn_generation_parts) = fname.split_once("__")?;
|
||||
let (key_start_str, key_end_str) = key_parts.split_once('-')?;
|
||||
let (lsn_start_str, lsn_end_generation_parts) = lsn_generation_parts.split_once('-')?;
|
||||
let lsn_end_str = if let Some((lsn_end_str, maybe_generation)) =
|
||||
lsn_end_generation_parts.split_once('-')
|
||||
let mut parts = fname.split("__");
|
||||
let mut key_parts = parts.next()?.split('-');
|
||||
let mut lsn_parts = parts.next()?.split('-');
|
||||
|
||||
let key_start_str = key_parts.next()?;
|
||||
let key_end_str = key_parts.next()?;
|
||||
let lsn_start_str = lsn_parts.next()?;
|
||||
let lsn_end_str = lsn_parts.next()?;
|
||||
|
||||
if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
if key_start_str.len() != 36
|
||||
|| key_end_str.len() != 36
|
||||
|| lsn_start_str.len() != 16
|
||||
|| lsn_end_str.len() != 16
|
||||
{
|
||||
if maybe_generation.starts_with("v") {
|
||||
// vY-XXXXXXXX
|
||||
lsn_end_str
|
||||
} else if maybe_generation.len() == 8 {
|
||||
// XXXXXXXX
|
||||
lsn_end_str
|
||||
} else {
|
||||
// no idea what this is
|
||||
return None;
|
||||
}
|
||||
} else {
|
||||
lsn_end_generation_parts
|
||||
};
|
||||
return None;
|
||||
}
|
||||
|
||||
let key_start = Key::from_hex(key_start_str).ok()?;
|
||||
let key_end = Key::from_hex(key_end_str).ok()?;
|
||||
@@ -170,29 +173,25 @@ impl ImageLayerName {
|
||||
/// Represents the part of the Key-LSN space covered by an ImageLayer
|
||||
///
|
||||
/// ```text
|
||||
/// <key start>-<key end>__<LSN>-<generation>
|
||||
/// <key start>-<key end>__<LSN>
|
||||
/// ```
|
||||
impl ImageLayerName {
|
||||
/// Parse a string as the LayerName part of an image layer file name. Returns None if the
|
||||
/// filename does not match the expected pattern.
|
||||
pub fn parse_str(fname: &str) -> Option<Self> {
|
||||
let (key_parts, lsn_generation_parts) = fname.split_once("__")?;
|
||||
let (key_start_str, key_end_str) = key_parts.split_once('-')?;
|
||||
let lsn_str =
|
||||
if let Some((lsn_str, maybe_generation)) = lsn_generation_parts.split_once('-') {
|
||||
if maybe_generation.starts_with("v") {
|
||||
// vY-XXXXXXXX
|
||||
lsn_str
|
||||
} else if maybe_generation.len() == 8 {
|
||||
// XXXXXXXX
|
||||
lsn_str
|
||||
} else {
|
||||
// likely a delta layer
|
||||
return None;
|
||||
}
|
||||
} else {
|
||||
lsn_generation_parts
|
||||
};
|
||||
let mut parts = fname.split("__");
|
||||
let mut key_parts = parts.next()?.split('-');
|
||||
|
||||
let key_start_str = key_parts.next()?;
|
||||
let key_end_str = key_parts.next()?;
|
||||
let lsn_str = parts.next()?;
|
||||
if parts.next().is_some() || key_parts.next().is_some() {
|
||||
return None;
|
||||
}
|
||||
|
||||
if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let key_start = Key::from_hex(key_start_str).ok()?;
|
||||
let key_end = Key::from_hex(key_end_str).ok()?;
|
||||
@@ -259,14 +258,6 @@ impl LayerName {
|
||||
}
|
||||
}
|
||||
|
||||
/// Gets the LSN range encoded in the layer name.
|
||||
pub fn lsn_as_range(&self) -> Range<Lsn> {
|
||||
match &self {
|
||||
LayerName::Image(layer) => layer.lsn_as_range(),
|
||||
LayerName::Delta(layer) => layer.lsn_range.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn is_delta(&self) -> bool {
|
||||
matches!(self, LayerName::Delta(_))
|
||||
}
|
||||
@@ -299,8 +290,18 @@ impl FromStr for LayerName {
|
||||
/// Self. When loading a physical layer filename, we drop any extra information
|
||||
/// not needed to build Self.
|
||||
fn from_str(value: &str) -> Result<Self, Self::Err> {
|
||||
let delta = DeltaLayerName::parse_str(value);
|
||||
let image = ImageLayerName::parse_str(value);
|
||||
let gen_suffix_regex = Regex::new("^(?<base>.+)(?<gen>-v1-[0-9a-f]{8})$").unwrap();
|
||||
let file_name: Cow<str> = match gen_suffix_regex.captures(value) {
|
||||
Some(captures) => captures
|
||||
.name("base")
|
||||
.expect("Non-optional group")
|
||||
.as_str()
|
||||
.into(),
|
||||
None => value.into(),
|
||||
};
|
||||
|
||||
let delta = DeltaLayerName::parse_str(&file_name);
|
||||
let image = ImageLayerName::parse_str(&file_name);
|
||||
let ok = match (delta, image) {
|
||||
(None, None) => {
|
||||
return Err(format!(
|
||||
@@ -366,14 +367,11 @@ mod test {
|
||||
lsn: Lsn::from_hex("00000000014FED58").unwrap(),
|
||||
});
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap();
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").unwrap();
|
||||
assert_eq!(parsed, expected);
|
||||
assert_eq!(parsed, expected,);
|
||||
|
||||
// Omitting generation suffix is valid
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap();
|
||||
assert_eq!(parsed, expected);
|
||||
assert_eq!(parsed, expected,);
|
||||
}
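
// Standalone illustration of the generation handling exercised by the test
// above: a physical layer file name may end in "-v1-XXXXXXXX" (a generation
// suffix), which is stripped before the LayerName itself is parsed. Simplified
// sketch; the real logic is the gen_suffix_regex in LayerName::from_str.
fn strip_generation_suffix(file_name: &str) -> &str {
    let re = regex::Regex::new(r"^(?P<base>.+)-v1-[0-9a-f]{8}$").unwrap();
    match re.captures(file_name) {
        Some(caps) => caps.name("base").unwrap().as_str(),
        None => file_name,
    }
}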
|
||||
|
||||
#[test]
|
||||
@@ -387,9 +385,6 @@ mod test {
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap();
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").unwrap();
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
// Omitting generation suffix is valid
|
||||
let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap();
|
||||
assert_eq!(parsed, expected);
|
||||
|
||||
@@ -1,24 +1,21 @@
|
||||
use std::{
|
||||
cmp::Ordering,
|
||||
collections::{binary_heap, BinaryHeap},
|
||||
sync::Arc,
|
||||
};
|
||||
|
||||
use anyhow::bail;
|
||||
use pageserver_api::key::Key;
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use crate::context::RequestContext;
|
||||
use pageserver_api::value::Value;
|
||||
use crate::{context::RequestContext, repository::Value};
|
||||
|
||||
use super::{
|
||||
delta_layer::{DeltaLayerInner, DeltaLayerIterator},
|
||||
image_layer::{ImageLayerInner, ImageLayerIterator},
|
||||
PersistentLayerDesc, PersistentLayerKey,
|
||||
};
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub(crate) enum LayerRef<'a> {
|
||||
enum LayerRef<'a> {
|
||||
Image(&'a ImageLayerInner),
|
||||
Delta(&'a DeltaLayerInner),
|
||||
}
|
||||
@@ -64,20 +61,18 @@ impl LayerIterRef<'_> {
|
||||
/// 1. Unified iterator for image and delta layers.
|
||||
/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge).
|
||||
/// 3. Lazy creation of the real delta/image iterator.
|
||||
pub(crate) enum IteratorWrapper<'a> {
|
||||
enum IteratorWrapper<'a> {
|
||||
NotLoaded {
|
||||
ctx: &'a RequestContext,
|
||||
first_key_lower_bound: (Key, Lsn),
|
||||
layer: LayerRef<'a>,
|
||||
source_desc: Arc<PersistentLayerKey>,
|
||||
},
|
||||
Loaded {
|
||||
iter: PeekableLayerIterRef<'a>,
|
||||
source_desc: Arc<PersistentLayerKey>,
|
||||
},
|
||||
}
|
||||
|
||||
pub(crate) struct PeekableLayerIterRef<'a> {
|
||||
struct PeekableLayerIterRef<'a> {
|
||||
iter: LayerIterRef<'a>,
|
||||
peeked: Option<(Key, Lsn, Value)>, // None == end
|
||||
}
|
||||
@@ -155,12 +150,6 @@ impl<'a> IteratorWrapper<'a> {
|
||||
layer: LayerRef::Image(image_layer),
|
||||
first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()),
|
||||
ctx,
|
||||
source_desc: PersistentLayerKey {
|
||||
key_range: image_layer.key_range().clone(),
|
||||
lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer.lsn()),
|
||||
is_delta: false,
|
||||
}
|
||||
.into(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -172,18 +161,12 @@ impl<'a> IteratorWrapper<'a> {
|
||||
layer: LayerRef::Delta(delta_layer),
|
||||
first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start),
|
||||
ctx,
|
||||
source_desc: PersistentLayerKey {
|
||||
key_range: delta_layer.key_range().clone(),
|
||||
lsn_range: delta_layer.lsn_range().clone(),
|
||||
is_delta: true,
|
||||
}
|
||||
.into(),
|
||||
}
|
||||
}
|
||||
|
||||
fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
|
||||
match self {
|
||||
Self::Loaded { iter, .. } => iter
|
||||
Self::Loaded { iter } => iter
|
||||
.peek()
|
||||
.as_ref()
|
||||
.map(|(key, lsn, val)| (key, *lsn, Some(val))),
|
||||
@@ -207,7 +190,6 @@ impl<'a> IteratorWrapper<'a> {
|
||||
ctx,
|
||||
first_key_lower_bound,
|
||||
layer,
|
||||
source_desc,
|
||||
} = self
|
||||
else {
|
||||
unreachable!()
|
||||
@@ -223,10 +205,7 @@ impl<'a> IteratorWrapper<'a> {
|
||||
);
|
||||
}
|
||||
}
|
||||
*self = Self::Loaded {
|
||||
iter,
|
||||
source_desc: source_desc.clone(),
|
||||
};
|
||||
*self = Self::Loaded { iter };
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -240,19 +219,11 @@ impl<'a> IteratorWrapper<'a> {
|
||||
/// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and
|
||||
/// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`].
|
||||
async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
let Self::Loaded { iter, .. } = self else {
|
||||
let Self::Loaded { iter } = self else {
|
||||
panic!("must load the iterator before using")
|
||||
};
|
||||
iter.next().await
|
||||
}
|
||||
|
||||
/// Get the persistent layer key corresponding to this iterator
|
||||
fn trace_source(&self) -> Arc<PersistentLayerKey> {
|
||||
match self {
|
||||
Self::Loaded { source_desc, .. } => source_desc.clone(),
|
||||
Self::NotLoaded { source_desc, .. } => source_desc.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A merge iterator over delta/image layer iterators.
|
||||
@@ -270,32 +241,6 @@ pub struct MergeIterator<'a> {
|
||||
heap: BinaryHeap<IteratorWrapper<'a>>,
|
||||
}
|
||||
|
||||
pub(crate) trait MergeIteratorItem {
|
||||
fn new(item: (Key, Lsn, Value), iterator: &IteratorWrapper<'_>) -> Self;
|
||||
|
||||
fn key_lsn_value(&self) -> &(Key, Lsn, Value);
|
||||
}
|
||||
|
||||
impl MergeIteratorItem for (Key, Lsn, Value) {
|
||||
fn new(item: (Key, Lsn, Value), _: &IteratorWrapper<'_>) -> Self {
|
||||
item
|
||||
}
|
||||
|
||||
fn key_lsn_value(&self) -> &(Key, Lsn, Value) {
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl MergeIteratorItem for ((Key, Lsn, Value), Arc<PersistentLayerKey>) {
|
||||
fn new(item: (Key, Lsn, Value), iter: &IteratorWrapper<'_>) -> Self {
|
||||
(item, iter.trace_source().clone())
|
||||
}
|
||||
|
||||
fn key_lsn_value(&self) -> &(Key, Lsn, Value) {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> MergeIterator<'a> {
|
||||
pub fn create(
|
||||
deltas: &[&'a DeltaLayerInner],
|
||||
@@ -314,7 +259,7 @@ impl<'a> MergeIterator<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn next_inner<R: MergeIteratorItem>(&mut self) -> anyhow::Result<Option<R>> {
|
||||
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
while let Some(mut iter) = self.heap.peek_mut() {
|
||||
if !iter.is_loaded() {
|
||||
// Once we load the iterator, we can know the real first key-value pair in the iterator.
|
||||
@@ -329,22 +274,10 @@ impl<'a> MergeIterator<'a> {
|
||||
binary_heap::PeekMut::pop(iter);
|
||||
continue;
|
||||
};
|
||||
return Ok(Some(R::new(item, &iter)));
|
||||
return Ok(Some(item));
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Get the next key-value pair from the iterator.
|
||||
pub async fn next(&mut self) -> anyhow::Result<Option<(Key, Lsn, Value)>> {
|
||||
self.next_inner().await
|
||||
}
|
||||
|
||||
/// Get the next key-value pair from the iterator, and trace where the key comes from.
|
||||
pub async fn next_with_trace(
|
||||
&mut self,
|
||||
) -> anyhow::Result<Option<((Key, Lsn, Value), Arc<PersistentLayerKey>)>> {
|
||||
self.next_inner().await
|
||||
}
|
||||
}
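
// Minimal sketch of the k-merge driving MergeIterator (toy (key, lsn) = (u64, u64)
// tuples and eagerly created iterators, i.e. no lazy loading or source tracing):
// keep the head of every sorted per-layer stream in a min-heap and repeatedly pop
// the smallest one, refilling from the stream it came from.
fn k_merge(mut streams: Vec<std::vec::IntoIter<(u64, u64)>>) -> Vec<(u64, u64)> {
    use std::cmp::Reverse;
    let mut heap = std::collections::BinaryHeap::new();
    for (idx, stream) in streams.iter_mut().enumerate() {
        if let Some(head) = stream.next() {
            heap.push(Reverse((head, idx)));
        }
    }
    let mut merged = Vec::new();
    while let Some(Reverse((head, idx))) = heap.pop() {
        merged.push(head);
        if let Some(next) = streams[idx].next() {
            heap.push(Reverse((next, idx)));
        }
    }
    merged
}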
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -358,16 +291,12 @@ mod tests {
|
||||
use crate::{
|
||||
tenant::{
|
||||
harness::{TenantHarness, TIMELINE_ID},
|
||||
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
|
||||
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
|
||||
},
|
||||
walrecord::NeonWalRecord,
|
||||
DEFAULT_PG_VERSION,
|
||||
};
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
use crate::tenant::storage_layer::delta_layer::test::sort_delta_value;
|
||||
#[cfg(feature = "testing")]
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
|
||||
async fn assert_merge_iter_equal(
|
||||
merge_iter: &mut MergeIterator<'_>,
|
||||
expect: &[(Key, Lsn, Value)],
|
||||
@@ -390,8 +319,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn merge_in_between() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_merge_in_between")
|
||||
.await
|
||||
@@ -455,8 +384,8 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn delta_merge() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_delta_merge")
|
||||
.await
|
||||
@@ -529,11 +458,10 @@ mod tests {
|
||||
// TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
#[tokio::test]
|
||||
async fn delta_image_mixed_merge() {
|
||||
use crate::repository::Value;
|
||||
use bytes::Bytes;
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
|
||||
.await
|
||||
@@ -658,6 +586,5 @@ mod tests {
|
||||
is_send(merge_iter);
|
||||
}
|
||||
|
||||
#[cfg(feature = "testing")]
|
||||
fn is_send(_: impl Send) {}
|
||||
}
|
||||
|
||||
@@ -279,7 +279,6 @@ fn log_compaction_error(
|
||||
|
||||
let decision = match e {
|
||||
ShuttingDown => None,
|
||||
Offload(_) => Some(LooksLike::Error),
|
||||
_ if task_cancelled => Some(LooksLike::Info),
|
||||
Other(e) => {
|
||||
let root_cause = e.root_cause();
|
||||
|
||||
@@ -20,13 +20,11 @@ use chrono::{DateTime, Utc};
|
||||
use enumset::EnumSet;
|
||||
use fail::fail_point;
|
||||
use handle::ShardTimelineId;
|
||||
use offload::OffloadError;
|
||||
use once_cell::sync::Lazy;
|
||||
use pageserver_api::{
|
||||
config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD,
|
||||
key::{
|
||||
KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE,
|
||||
NON_INHERITED_SPARSE_RANGE,
|
||||
CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX,
|
||||
NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE,
|
||||
},
|
||||
keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning},
|
||||
models::{
|
||||
@@ -50,7 +48,6 @@ use utils::{
|
||||
fs_ext, pausable_failpoint,
|
||||
sync::gate::{Gate, GateGuard},
|
||||
};
|
||||
use wal_decoder::serialized_batch::SerializedValueBatch;
|
||||
|
||||
use std::sync::atomic::Ordering as AtomicOrdering;
|
||||
use std::sync::{Arc, Mutex, RwLock, Weak};
|
||||
@@ -128,11 +125,11 @@ use utils::{
|
||||
simple_rcu::{Rcu, RcuReadGuard},
|
||||
};
|
||||
|
||||
use crate::repository::GcResult;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::task_mgr;
|
||||
use crate::task_mgr::TaskKind;
|
||||
use crate::tenant::gc_result::GcResult;
|
||||
use crate::ZERO_PAGE;
|
||||
use pageserver_api::key::Key;
|
||||
|
||||
use self::delete::DeleteTimelineFlow;
|
||||
pub(super) use self::eviction_task::EvictionTaskTenantState;
|
||||
@@ -142,7 +139,9 @@ use self::logical_size::LogicalSize;
|
||||
use self::walreceiver::{WalReceiver, WalReceiverConf};
|
||||
|
||||
use super::{
|
||||
config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized,
|
||||
config::TenantConf,
|
||||
storage_layer::{inmemory_layer, LayerVisibilityHint},
|
||||
upload_queue::NotInitialized,
|
||||
MaybeOffloaded,
|
||||
};
|
||||
use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
|
||||
@@ -156,9 +155,6 @@ use super::{
|
||||
GcError,
|
||||
};
|
||||
|
||||
#[cfg(test)]
|
||||
use pageserver_api::value::Value;
|
||||
|
||||
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
|
||||
pub(crate) enum FlushLoopState {
|
||||
NotStarted,
|
||||
@@ -853,10 +849,6 @@ pub(crate) enum ShutdownMode {
|
||||
/// While we are flushing, we continue to accept read I/O for LSNs ingested before
|
||||
/// the call to [`Timeline::shutdown`].
|
||||
FreezeAndFlush,
|
||||
/// Only flush the layers to the remote storage without freezing any open layers. This is the
|
||||
/// mode used by ancestor detach and any other operations that reload a tenant without increasing
|
||||
/// the generation number.
|
||||
Flush,
|
||||
/// Shut down immediately, without waiting for any open layers to flush.
|
||||
Hard,
|
||||
}
|
||||
@@ -1570,16 +1562,12 @@ impl Timeline {
|
||||
///
|
||||
/// This is necessary but not sufficient for offloading of the timeline as it might have
|
||||
/// child timelines that are not offloaded yet.
|
||||
pub(crate) fn can_offload(&self) -> (bool, &'static str) {
|
||||
pub(crate) fn can_offload(&self) -> bool {
|
||||
if self.remote_client.is_archived() != Some(true) {
|
||||
return (false, "the timeline is not archived");
|
||||
}
|
||||
if !self.remote_client.no_pending_work() {
|
||||
// if the remote client is still processing some work, we can't offload
|
||||
return (false, "the upload queue is not drained yet");
|
||||
return false;
|
||||
}
|
||||
|
||||
(true, "ok")
|
||||
true
|
||||
}
|
||||
|
||||
/// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending
|
||||
@@ -1687,6 +1675,11 @@ impl Timeline {
|
||||
pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let try_freeze_and_flush = match mode {
|
||||
ShutdownMode::FreezeAndFlush => true,
|
||||
ShutdownMode::Hard => false,
|
||||
};
|
||||
|
||||
// Regardless of whether we're going to try_freeze_and_flush
|
||||
// or not, stop ingesting any more data. Walreceiver only provides
|
||||
// cancellation but no "wait until gone", because it uses the Timeline::gate.
|
||||
@@ -1708,7 +1701,7 @@ impl Timeline {
|
||||
// ... and inform any waiters for newer LSNs that there won't be any.
|
||||
self.last_record_lsn.shutdown();
|
||||
|
||||
if let ShutdownMode::FreezeAndFlush = mode {
|
||||
if try_freeze_and_flush {
|
||||
if let Some((open, frozen)) = self
|
||||
.layers
|
||||
.read()
|
||||
@@ -1750,20 +1743,6 @@ impl Timeline {
|
||||
warn!("failed to freeze and flush: {e:#}");
|
||||
}
|
||||
}
|
||||
|
||||
// `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
|
||||
// we also do a final check here to ensure that the queue is empty.
|
||||
if !self.remote_client.no_pending_work() {
|
||||
warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
|
||||
}
|
||||
}
|
||||
|
||||
if let ShutdownMode::Flush = mode {
|
||||
// drain the upload queue
|
||||
self.remote_client.shutdown().await;
|
||||
if !self.remote_client.no_pending_work() {
|
||||
warn!("still have pending work in remote upload queue, but continuing shutting down anyways");
|
||||
}
|
||||
}
|
||||
|
||||
// Signal any subscribers to our cancellation token to drop out
|
||||
@@ -3506,37 +3485,18 @@ impl Timeline {
|
||||
|
||||
let timer = self.metrics.flush_time_histo.start_timer();
|
||||
|
||||
let num_frozen_layers;
|
||||
let frozen_layer_total_size;
|
||||
let layer_to_flush = {
|
||||
let guard = self.layers.read().await;
|
||||
let Ok(lm) = guard.layer_map() else {
|
||||
info!("dropping out of flush loop for timeline shutdown");
|
||||
return;
|
||||
};
|
||||
num_frozen_layers = lm.frozen_layers.len();
|
||||
frozen_layer_total_size = lm
|
||||
.frozen_layers
|
||||
.iter()
|
||||
.map(|l| l.estimated_in_mem_size())
|
||||
.sum::<u64>();
|
||||
lm.frozen_layers.front().cloned()
|
||||
// drop 'layers' lock to allow concurrent reads and writes
|
||||
};
|
||||
let Some(layer_to_flush) = layer_to_flush else {
|
||||
break Ok(());
|
||||
};
|
||||
if num_frozen_layers
|
||||
> std::cmp::max(
|
||||
self.get_compaction_threshold(),
|
||||
DEFAULT_COMPACTION_THRESHOLD,
|
||||
)
|
||||
&& frozen_layer_total_size >= /* 128 MB */ 128000000
|
||||
{
|
||||
tracing::warn!(
|
||||
"too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes",
|
||||
);
|
||||
}
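The warning above fires only when both limits are exceeded: too many frozen layers and too much estimated in-memory data. A standalone restatement of that condition; `DEFAULT_COMPACTION_THRESHOLD` is given an assumed value here only so the snippet compiles, the real constant lives in the pageserver config defaults:

```rust
// Assumed value for illustration only.
const DEFAULT_COMPACTION_THRESHOLD: usize = 10;

fn too_many_frozen_layers(num_frozen: usize, total_in_mem_size: u64, compaction_threshold: usize) -> bool {
    num_frozen > std::cmp::max(compaction_threshold, DEFAULT_COMPACTION_THRESHOLD)
        && total_in_mem_size >= 128_000_000 // ~128 MB, as in the diff
}
```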
match self.flush_frozen_layer(layer_to_flush, ctx).await {
|
||||
Ok(this_layer_to_lsn) => {
|
||||
flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn);
|
||||
@@ -4127,7 +4087,6 @@ impl Timeline {
|
||||
) -> Result<ImageLayerCreationOutcome, CreateImageLayersError> {
|
||||
// Metadata keys image layer creation.
|
||||
let mut reconstruct_state = ValuesReconstructState::default();
|
||||
let begin = Instant::now();
|
||||
let data = self
|
||||
.get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx)
|
||||
.await?;
|
||||
@@ -4144,11 +4103,14 @@ impl Timeline {
|
||||
(new_data, total_kb_retrieved / 1024, total_keys_retrieved)
|
||||
};
|
||||
let delta_files_accessed = reconstruct_state.get_delta_layers_visited();
|
||||
let elapsed = begin.elapsed();
|
||||
|
||||
let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS;
|
||||
info!(
|
||||
"metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64()
|
||||
debug!(
|
||||
trigger_generation,
|
||||
delta_files_accessed,
|
||||
total_kb_retrieved,
|
||||
total_keys_retrieved,
|
||||
"generate metadata images"
|
||||
);
|
||||
|
||||
if !trigger_generation && mode == ImageLayerCreationMode::Try {
|
||||
@@ -4512,23 +4474,11 @@ impl Drop for Timeline {
|
||||
pub(crate) enum CompactionError {
|
||||
#[error("The timeline or pageserver is shutting down")]
|
||||
ShuttingDown,
|
||||
/// Compaction tried to offload a timeline and failed
|
||||
#[error("Failed to offload timeline: {0}")]
|
||||
Offload(OffloadError),
|
||||
/// Compaction cannot be done right now; page reconstruction and so on.
|
||||
#[error(transparent)]
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<OffloadError> for CompactionError {
|
||||
fn from(e: OffloadError) -> Self {
|
||||
match e {
|
||||
OffloadError::Cancelled => Self::ShuttingDown,
|
||||
_ => Self::Offload(e),
|
||||
}
|
||||
}
|
||||
}
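The `From<OffloadError>` impl above is what lets `?` propagate offload failures out of compaction while folding cancellation into `ShuttingDown`. A self-contained analogue with stand-in types (the real enums carry more variants and error payloads):

```rust
#[derive(Debug)]
enum OffloadError {
    Cancelled,
    Other(String),
}

#[derive(Debug)]
enum CompactionError {
    ShuttingDown,
    Offload(OffloadError),
}

impl From<OffloadError> for CompactionError {
    fn from(e: OffloadError) -> Self {
        match e {
            OffloadError::Cancelled => Self::ShuttingDown,
            other => Self::Offload(other),
        }
    }
}

fn offload() -> Result<(), OffloadError> {
    Err(OffloadError::Cancelled)
}

fn compact() -> Result<(), CompactionError> {
    offload()?; // `?` applies the From impl, so cancellation maps to ShuttingDown
    Ok(())
}
```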
|
||||
impl CompactionError {
|
||||
pub fn is_cancelled(&self) -> bool {
|
||||
matches!(self, CompactionError::ShuttingDown)
|
||||
@@ -5772,22 +5722,23 @@ impl<'a> TimelineWriter<'a> {
|
||||
/// Put a batch of keys at the specified Lsns.
|
||||
pub(crate) async fn put_batch(
|
||||
&mut self,
|
||||
batch: SerializedValueBatch,
|
||||
batch: Vec<(CompactKey, Lsn, usize, Value)>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
if batch.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let batch_max_lsn = batch.max_lsn;
|
||||
let buf_size: u64 = batch.buffer_size() as u64;
|
||||
let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch)?;
|
||||
let batch_max_lsn = serialized_batch.max_lsn;
|
||||
let buf_size: u64 = serialized_batch.raw.len() as u64;
|
||||
|
||||
let action = self.get_open_layer_action(batch_max_lsn, buf_size);
|
||||
let layer = self
|
||||
.handle_open_layer_action(batch_max_lsn, action, ctx)
|
||||
.await?;
|
||||
|
||||
let res = layer.put_batch(batch, ctx).await;
|
||||
let res = layer.put_batch(serialized_batch, ctx).await;
|
||||
|
||||
if res.is_ok() {
|
||||
// Update the current size only when the entire write was ok.
|
||||
@@ -5822,14 +5773,11 @@ impl<'a> TimelineWriter<'a> {
|
||||
);
|
||||
}
|
||||
let val_ser_size = value.serialized_size().unwrap() as usize;
|
||||
let batch = SerializedValueBatch::from_values(vec![(
|
||||
key.to_compact(),
|
||||
lsn,
|
||||
val_ser_size,
|
||||
value.clone(),
|
||||
)]);
|
||||
|
||||
self.put_batch(batch, ctx).await
|
||||
self.put_batch(
|
||||
vec![(key.to_compact(), lsn, val_ser_size, value.clone())],
|
||||
ctx,
|
||||
)
|
||||
.await
|
||||
}
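Both versions of `put_batch` above rely on the batch knowing its maximum LSN and its serialized buffer size, which drive the open-layer roll decision. A self-contained analogue of that bookkeeping; the struct and field layout here are stand-ins, not the pageserver's `SerializedValueBatch`:

```rust
/// Stand-in batch type: (compact key, lsn, serialized value bytes) per entry.
struct ValueBatch {
    entries: Vec<(u64, u64, Vec<u8>)>,
}

impl ValueBatch {
    fn is_empty(&self) -> bool {
        self.entries.is_empty()
    }
    /// Highest LSN in the batch; used to pick/roll the open in-memory layer.
    fn max_lsn(&self) -> u64 {
        self.entries.iter().map(|(_, lsn, _)| *lsn).max().unwrap_or(0)
    }
    /// Total serialized size; used for the layer-size accounting after a successful write.
    fn buffer_size(&self) -> u64 {
        self.entries.iter().map(|(_, _, v)| v.len() as u64).sum()
    }
}
```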
|
||||
pub(crate) async fn delete_batch(
|
||||
@@ -5874,15 +5822,17 @@ fn is_send() {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::value::Value;
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::tenant::{
|
||||
harness::{test_img, TenantHarness},
|
||||
layer_map::LayerMap,
|
||||
storage_layer::{Layer, LayerName},
|
||||
timeline::{DeltaLayerTestDesc, EvictionError},
|
||||
Timeline,
|
||||
use crate::{
|
||||
repository::Value,
|
||||
tenant::{
|
||||
harness::{test_img, TenantHarness},
|
||||
layer_map::LayerMap,
|
||||
storage_layer::{Layer, LayerName},
|
||||
timeline::{DeltaLayerTestDesc, EvictionError},
|
||||
Timeline,
|
||||
},
|
||||
};
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
//!
|
||||
//! The old legacy algorithm is implemented directly in `timeline.rs`.
|
||||
|
||||
use std::collections::{BinaryHeap, HashMap, HashSet};
|
||||
use std::collections::{BinaryHeap, HashSet};
|
||||
use std::ops::{Deref, Range};
|
||||
use std::sync::Arc;
|
||||
|
||||
@@ -49,14 +49,13 @@ use pageserver_api::config::tenant_conf_defaults::{
|
||||
DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD,
|
||||
};
|
||||
|
||||
use pageserver_api::key::Key;
|
||||
use pageserver_api::keyspace::KeySpace;
|
||||
use pageserver_api::record::NeonWalRecord;
|
||||
use pageserver_api::value::Value;
|
||||
use crate::keyspace::KeySpace;
|
||||
use crate::repository::{Key, Value};
|
||||
use crate::walrecord::NeonWalRecord;
|
||||
|
||||
use utils::lsn::Lsn;
|
||||
|
||||
use pageserver_compaction::helpers::{fully_contains, overlaps_with};
|
||||
use pageserver_compaction::helpers::overlaps_with;
|
||||
use pageserver_compaction::interface::*;
|
||||
|
||||
use super::CompactionError;
|
||||
@@ -64,23 +63,6 @@ use super::CompactionError;
|
||||
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
|
||||
const COMPACTION_DELTA_THRESHOLD: usize = 5;
|
||||
|
||||
pub struct GcCompactionJobDescription {
|
||||
/// All layers to read in the compaction job
|
||||
selected_layers: Vec<Layer>,
|
||||
/// GC cutoff of the job
|
||||
gc_cutoff: Lsn,
|
||||
/// LSNs to retain for the job
|
||||
retain_lsns_below_horizon: Vec<Lsn>,
|
||||
/// Maximum layer LSN processed in this compaction
|
||||
max_layer_lsn: Lsn,
|
||||
/// Only compact layers overlapping with this range
|
||||
compaction_key_range: Range<Key>,
|
||||
/// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap.
|
||||
/// This field is here solely for debugging. The field will not be read once the compaction
|
||||
/// description is generated.
|
||||
rewrite_layers: Vec<Arc<PersistentLayerDesc>>,
|
||||
}
|
||||
|
||||
/// The result of bottom-most compaction for a single key at each LSN.
|
||||
#[derive(Debug)]
|
||||
#[cfg_attr(test, derive(PartialEq))]
|
||||
@@ -1733,36 +1715,20 @@ impl Timeline {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn compact_with_gc(
|
||||
self: &Arc<Self>,
|
||||
cancel: &CancellationToken,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
self.partial_compact_with_gc(Key::MIN..Key::MAX, cancel, flags, ctx)
|
||||
.await
|
||||
}
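The wrapper above reduces a full gc-compaction to a partial one over the whole key space. A trivial standalone analogue of that pattern, with a plain integer key type standing in for `Key`:

```rust
fn partial_compact(key_range: std::ops::Range<u64>) {
    // ... compaction restricted to `key_range` would go here ...
    let _ = key_range;
}

fn full_compact() {
    // Full compaction is just partial compaction over MIN..MAX, as in the diff.
    partial_compact(u64::MIN..u64::MAX);
}
```

As the doc comment that follows notes, the upper bound itself is technically excluded because Rust ranges are half-open.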
|
||||
/// An experimental compaction building block that combines compaction with garbage collection.
|
||||
///
|
||||
/// The current implementation picks all delta + image layers that are below or intersecting with
|
||||
/// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
|
||||
/// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
|
||||
/// and create delta layers with all deltas >= gc horizon.
|
||||
///
|
||||
/// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction.
|
||||
/// Partial compaction will read and process all layers overlapping with the key range, even if it might
|
||||
/// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained
|
||||
/// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing
|
||||
/// Key::MIN..Key::MAX to the function indicates a full compaction, though technically, `Key::MAX` is not
|
||||
/// part of the range.
|
||||
pub(crate) async fn partial_compact_with_gc(
|
||||
pub(crate) async fn compact_with_gc(
|
||||
self: &Arc<Self>,
|
||||
compaction_key_range: Range<Key>,
|
||||
cancel: &CancellationToken,
|
||||
flags: EnumSet<CompactFlags>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<()> {
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
// Block other compaction/GC tasks from running for now. GC-compaction could run along
|
||||
// with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
|
||||
// Note that we already acquired the compaction lock when the outer `compact` function gets called.
|
||||
@@ -1784,11 +1750,7 @@ impl Timeline {
|
||||
|
||||
let dry_run = flags.contains(CompactFlags::DryRun);
|
||||
|
||||
if compaction_key_range == (Key::MIN..Key::MAX) {
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
|
||||
} else {
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
|
||||
}
|
||||
info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
|
||||
|
||||
scopeguard::defer! {
|
||||
info!("done enhanced gc bottom-most compaction");
|
||||
@@ -1800,7 +1762,7 @@ impl Timeline {
|
||||
// The layer selection has the following properties:
|
||||
// 1. If a layer is in the selection, all layers below it are in the selection.
|
||||
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
|
||||
let job_desc = {
|
||||
let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
|
||||
let guard = self.layers.read().await;
|
||||
let layers = guard.layer_map()?;
|
||||
let gc_info = self.gc_info.read().unwrap();
|
||||
@@ -1816,7 +1778,7 @@ impl Timeline {
|
||||
retain_lsns_below_horizon.push(*lsn);
|
||||
}
|
||||
}
|
||||
let mut selected_layers: Vec<Layer> = Vec::new();
|
||||
let mut selected_layers = Vec::new();
|
||||
drop(gc_info);
|
||||
// Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
|
||||
let Some(max_layer_lsn) = layers
|
||||
@@ -1830,21 +1792,9 @@ impl Timeline {
|
||||
};
|
||||
// Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
|
||||
// layers to compact.
|
||||
let mut rewrite_layers = Vec::new();
|
||||
for desc in layers.iter_historic_layers() {
|
||||
if desc.get_lsn_range().end <= max_layer_lsn
|
||||
&& overlaps_with(&desc.get_key_range(), &compaction_key_range)
|
||||
{
|
||||
// If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range,
|
||||
// even if it might contain extra keys
|
||||
if desc.get_lsn_range().end <= max_layer_lsn {
|
||||
selected_layers.push(guard.get_from_desc(&desc));
|
||||
// If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine
|
||||
// to overlap image layers)
|
||||
if desc.is_delta()
|
||||
&& !fully_contains(&compaction_key_range, &desc.get_key_range())
|
||||
{
|
||||
rewrite_layers.push(desc);
|
||||
}
|
||||
}
|
||||
}
|
||||
if selected_layers.is_empty() {
|
||||
@@ -1852,88 +1802,72 @@ impl Timeline {
|
||||
return Ok(());
|
||||
}
|
||||
retain_lsns_below_horizon.sort();
|
||||
GcCompactionJobDescription {
|
||||
selected_layers,
|
||||
gc_cutoff,
|
||||
retain_lsns_below_horizon,
|
||||
max_layer_lsn,
|
||||
compaction_key_range,
|
||||
rewrite_layers,
|
||||
}
|
||||
(selected_layers, gc_cutoff, retain_lsns_below_horizon)
|
||||
};
|
||||
let lowest_retain_lsn = if self.ancestor_timeline.is_some() {
|
||||
Lsn(self.ancestor_lsn.0 + 1)
|
||||
} else {
|
||||
let res = job_desc
|
||||
.retain_lsns_below_horizon
|
||||
let res = retain_lsns_below_horizon
|
||||
.first()
|
||||
.copied()
|
||||
.unwrap_or(job_desc.gc_cutoff);
|
||||
.unwrap_or(gc_cutoff);
|
||||
if cfg!(debug_assertions) {
|
||||
assert_eq!(
|
||||
res,
|
||||
job_desc
|
||||
.retain_lsns_below_horizon
|
||||
retain_lsns_below_horizon
|
||||
.iter()
|
||||
.min()
|
||||
.copied()
|
||||
.unwrap_or(job_desc.gc_cutoff)
|
||||
.unwrap_or(gc_cutoff)
|
||||
);
|
||||
}
|
||||
res
|
||||
};
|
||||
info!(
|
||||
"picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}",
|
||||
job_desc.selected_layers.len(),
|
||||
job_desc.rewrite_layers.len(),
|
||||
job_desc.max_layer_lsn,
|
||||
job_desc.gc_cutoff,
|
||||
lowest_retain_lsn,
|
||||
job_desc.compaction_key_range.start,
|
||||
job_desc.compaction_key_range.end
|
||||
"picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
|
||||
layer_selection.len(),
|
||||
gc_cutoff,
|
||||
lowest_retain_lsn
|
||||
);
|
||||
|
||||
for layer in &job_desc.selected_layers {
|
||||
debug!("read layer: {}", layer.layer_desc().key());
|
||||
}
|
||||
for layer in &job_desc.rewrite_layers {
|
||||
debug!("rewrite layer: {}", layer.key());
|
||||
}
|
||||
self.check_compaction_space(&layer_selection).await?;
|
||||
|
||||
self.check_compaction_space(&job_desc.selected_layers)
|
||||
.await?;
|
||||
|
||||
// Generate statistics for the compaction
|
||||
for layer in &job_desc.selected_layers {
|
||||
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?)
|
||||
for layer in &layer_selection {
|
||||
let desc = layer.layer_desc();
|
||||
if desc.is_delta() {
|
||||
// ignore single-key layer files
|
||||
if desc.key_range.start.next() != desc.key_range.end {
|
||||
let lsn_range = &desc.lsn_range;
|
||||
lsn_split_point.insert(lsn_range.start);
|
||||
lsn_split_point.insert(lsn_range.end);
|
||||
}
|
||||
stat.visit_delta_layer(desc.file_size());
|
||||
} else {
|
||||
stat.visit_image_layer(desc.file_size());
|
||||
}
|
||||
}
|
||||
|
||||
// Step 1: construct a k-merge iterator over all layers.
|
||||
// Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point.
|
||||
let layer_names = job_desc
|
||||
.selected_layers
|
||||
let layer_names: Vec<crate::tenant::storage_layer::LayerName> = layer_selection
|
||||
.iter()
|
||||
.map(|layer| layer.layer_desc().layer_name())
|
||||
.collect_vec();
|
||||
if let Some(err) = check_valid_layermap(&layer_names) {
|
||||
warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err);
|
||||
bail!("cannot run gc-compaction because {}", err);
|
||||
}
|
||||
// The maximum LSN we are processing in this compaction loop
|
||||
let end_lsn = job_desc
|
||||
.selected_layers
|
||||
let end_lsn = layer_selection
|
||||
.iter()
|
||||
.map(|l| l.layer_desc().lsn_range.end)
|
||||
.max()
|
||||
.unwrap();
|
||||
// We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized
|
||||
// as an L0 layer.
|
||||
let mut delta_layers = Vec::new();
|
||||
let mut image_layers = Vec::new();
|
||||
let mut downloaded_layers = Vec::new();
|
||||
for layer in &job_desc.selected_layers {
|
||||
for layer in &layer_selection {
|
||||
let resident_layer = layer.download_and_keep_resident().await?;
|
||||
downloaded_layers.push(resident_layer);
|
||||
}
|
||||
@@ -1952,8 +1886,8 @@ impl Timeline {
|
||||
dense_ks,
|
||||
sparse_ks,
|
||||
)?;
|
||||
|
||||
// Step 2: Produce images+deltas.
|
||||
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
|
||||
// Data of the same key.
|
||||
let mut accumulated_values = Vec::new();
|
||||
let mut last_key: Option<Key> = None;
|
||||
|
||||
@@ -1965,7 +1899,7 @@ impl Timeline {
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
job_desc.compaction_key_range.start,
|
||||
Key::MIN,
|
||||
lowest_retain_lsn,
|
||||
self.get_compaction_target_size(),
|
||||
ctx,
|
||||
@@ -1985,13 +1919,6 @@ impl Timeline {
|
||||
)
|
||||
.await?;
|
||||
|
||||
#[derive(Default)]
|
||||
struct RewritingLayers {
|
||||
before: Option<DeltaLayerWriter>,
|
||||
after: Option<DeltaLayerWriter>,
|
||||
}
|
||||
let mut delta_layer_rewriters = HashMap::<Arc<PersistentLayerKey>, RewritingLayers>::new();
|
||||
|
||||
/// Returns None if there is no ancestor branch. Throws an error when the key is not found.
|
||||
///
|
||||
/// Currently, we always get the ancestor image for each key in the child branch no matter whether the image
|
||||
@@ -2017,51 +1944,10 @@ impl Timeline {
|
||||
// the key and LSN range are determined. However, to keep things simple here, we still
|
||||
// create this writer, and discard the writer in the end.
|
||||
|
||||
while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
|
||||
while let Some((key, lsn, val)) = merge_iter.next().await? {
|
||||
if cancel.is_cancelled() {
|
||||
return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
|
||||
}
|
||||
if !job_desc.compaction_key_range.contains(&key) {
|
||||
if !desc.is_delta {
|
||||
continue;
|
||||
}
|
||||
let rewriter = delta_layer_rewriters.entry(desc.clone()).or_default();
|
||||
let rewriter = if key < job_desc.compaction_key_range.start {
|
||||
if rewriter.before.is_none() {
|
||||
rewriter.before = Some(
|
||||
DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
desc.key_range.start,
|
||||
desc.lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
rewriter.before.as_mut().unwrap()
|
||||
} else if key >= job_desc.compaction_key_range.end {
|
||||
if rewriter.after.is_none() {
|
||||
rewriter.after = Some(
|
||||
DeltaLayerWriter::new(
|
||||
self.conf,
|
||||
self.timeline_id,
|
||||
self.tenant_shard_id,
|
||||
job_desc.compaction_key_range.end,
|
||||
desc.lsn_range.clone(),
|
||||
ctx,
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
}
|
||||
rewriter.after.as_mut().unwrap()
|
||||
} else {
|
||||
unreachable!()
|
||||
};
|
||||
rewriter.put_value(key, lsn, val, ctx).await?;
|
||||
continue;
|
||||
}
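The branch above routes every key that falls outside the compaction key range into a per-layer rewriter, split into the part before the range and the part after it. A standalone sketch of just that routing decision, with stand-in types and a plain integer key:

```rust
enum RewriteSide {
    Before,
    After,
}

/// Which rewriter, if any, should receive a key given the compaction key range.
fn route_key(key: u64, compaction_key_range: &std::ops::Range<u64>) -> Option<RewriteSide> {
    if compaction_key_range.contains(&key) {
        None // inside the range: handled by the normal image/delta writers
    } else if key < compaction_key_range.start {
        Some(RewriteSide::Before)
    } else {
        Some(RewriteSide::After)
    }
}
```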
match val {
|
||||
Value::Image(_) => stat.visit_image_key(&val),
|
||||
Value::WalRecord(_) => stat.visit_wal_key(&val),
|
||||
@@ -2072,18 +1958,19 @@ impl Timeline {
|
||||
}
|
||||
accumulated_values.push((key, lsn, val));
|
||||
} else {
|
||||
let last_key: &mut Key = last_key.as_mut().unwrap();
|
||||
stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction
|
||||
let last_key = last_key.as_mut().unwrap();
|
||||
stat.on_unique_key_visited();
|
||||
let retention = self
|
||||
.generate_key_retention(
|
||||
*last_key,
|
||||
&accumulated_values,
|
||||
job_desc.gc_cutoff,
|
||||
&job_desc.retain_lsns_below_horizon,
|
||||
gc_cutoff,
|
||||
&retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, *last_key, ctx).await?,
|
||||
)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
retention
|
||||
.pipe_to(
|
||||
*last_key,
|
||||
@@ -2099,20 +1986,20 @@ impl Timeline {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: move the below part to the loop body
|
||||
let last_key = last_key.expect("no keys produced during compaction");
|
||||
// TODO: move this part to the loop body
|
||||
stat.on_unique_key_visited();
|
||||
|
||||
let retention = self
|
||||
.generate_key_retention(
|
||||
last_key,
|
||||
&accumulated_values,
|
||||
job_desc.gc_cutoff,
|
||||
&job_desc.retain_lsns_below_horizon,
|
||||
gc_cutoff,
|
||||
&retain_lsns_below_horizon,
|
||||
COMPACTION_DELTA_THRESHOLD,
|
||||
get_ancestor_image(self, last_key, ctx).await?,
|
||||
)
|
||||
.await?;
|
||||
// Put the image into the image layer. Currently we have a single big layer for the compaction.
|
||||
retention
|
||||
.pipe_to(
|
||||
last_key,
|
||||
@@ -2122,23 +2009,6 @@ impl Timeline {
|
||||
ctx,
|
||||
)
|
||||
.await?;
|
||||
// end: move the above part to the loop body
|
||||
|
||||
let mut rewrote_delta_layers = Vec::new();
|
||||
for (key, writers) in delta_layer_rewriters {
|
||||
if let Some(delta_writer_before) = writers.before {
|
||||
let (desc, path) = delta_writer_before
|
||||
.finish(job_desc.compaction_key_range.start, ctx)
|
||||
.await?;
|
||||
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
rewrote_delta_layers.push(layer);
|
||||
}
|
||||
if let Some(delta_writer_after) = writers.after {
|
||||
let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
|
||||
let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
|
||||
rewrote_delta_layers.push(layer);
|
||||
}
|
||||
}
|
||||
|
||||
let discard = |key: &PersistentLayerKey| {
|
||||
let key = key.clone();
|
||||
@@ -2147,9 +2017,8 @@ impl Timeline {
|
||||
|
||||
let produced_image_layers = if let Some(writer) = image_layer_writer {
|
||||
if !dry_run {
|
||||
let end_key = job_desc.compaction_key_range.end;
|
||||
writer
|
||||
.finish_with_discard_fn(self, ctx, end_key, discard)
|
||||
.finish_with_discard_fn(self, ctx, Key::MAX, discard)
|
||||
.await?
|
||||
} else {
|
||||
drop(writer);
|
||||
@@ -2168,8 +2037,6 @@ impl Timeline {
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
// TODO: make image/delta/rewrote_delta layers generation atomic. At this point, we already generated resident layers, and if
|
||||
// compaction is cancelled at this point, we might have some layers that are not cleaned up.
|
||||
let mut compact_to = Vec::new();
|
||||
let mut keep_layers = HashSet::new();
|
||||
let produced_delta_layers_len = produced_delta_layers.len();
|
||||
@@ -2177,82 +2044,28 @@ impl Timeline {
|
||||
for action in produced_delta_layers {
|
||||
match action {
|
||||
BatchWriterResult::Produced(layer) => {
|
||||
if cfg!(debug_assertions) {
|
||||
info!("produced delta layer: {}", layer.layer_desc().key());
|
||||
}
|
||||
stat.produce_delta_layer(layer.layer_desc().file_size());
|
||||
compact_to.push(layer);
|
||||
}
|
||||
BatchWriterResult::Discarded(l) => {
|
||||
if cfg!(debug_assertions) {
|
||||
info!("discarded delta layer: {}", l);
|
||||
}
|
||||
keep_layers.insert(l);
|
||||
stat.discard_delta_layer();
|
||||
}
|
||||
}
|
||||
}
|
||||
for layer in &rewrote_delta_layers {
|
||||
debug!(
|
||||
"produced rewritten delta layer: {}",
|
||||
layer.layer_desc().key()
|
||||
);
|
||||
}
|
||||
compact_to.extend(rewrote_delta_layers);
|
||||
for action in produced_image_layers {
|
||||
match action {
|
||||
BatchWriterResult::Produced(layer) => {
|
||||
debug!("produced image layer: {}", layer.layer_desc().key());
|
||||
stat.produce_image_layer(layer.layer_desc().file_size());
|
||||
compact_to.push(layer);
|
||||
}
|
||||
BatchWriterResult::Discarded(l) => {
|
||||
debug!("discarded image layer: {}", l);
|
||||
keep_layers.insert(l);
|
||||
stat.discard_image_layer();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut layer_selection = job_desc.selected_layers;
|
||||
|
||||
// Partial compaction might select more data than it processes, e.g., if
|
||||
// the compaction_key_range only partially overlaps:
|
||||
//
|
||||
// [---compaction_key_range---]
|
||||
// [---A----][----B----][----C----][----D----]
|
||||
//
|
||||
// For delta layers, we will rewrite the layers so that it is cut exactly at
|
||||
// the compaction key range, so we can always discard them. However, for image
|
||||
// layers, as we do not rewrite them for now, we need to handle them differently.
|
||||
// Assume image layers A, B, C, D are all in the `layer_selection`.
|
||||
//
|
||||
// The created image layers contain whatever is needed from B, C, and from
|
||||
// `----]` of A, and from `[---` of D.
|
||||
//
|
||||
// In contrast, `[---A` and `D----]` have not been processed, so we must
|
||||
// keep that data.
|
||||
//
|
||||
// The solution for now is to keep A and D completely if they are image layers.
|
||||
// (layer_selection is what we'll remove from the layer map, so, retain what
|
||||
// is _not_ fully covered by compaction_key_range).
|
||||
for layer in &layer_selection {
|
||||
if !layer.layer_desc().is_delta() {
|
||||
if !overlaps_with(
|
||||
&layer.layer_desc().key_range,
|
||||
&job_desc.compaction_key_range,
|
||||
) {
|
||||
bail!("violated constraint: image layer outside of compaction key range");
|
||||
}
|
||||
if !fully_contains(
|
||||
&job_desc.compaction_key_range,
|
||||
&layer.layer_desc().key_range,
|
||||
) {
|
||||
keep_layers.insert(layer.layer_desc().key());
|
||||
}
|
||||
}
|
||||
}
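The comment and loop above encode a simple rule: an image layer that only partially overlaps the compaction key range must be kept in the layer map, because its uncovered part was never reprocessed (delta layers, by contrast, get rewritten to the exact boundary). A self-contained restatement of that rule over plain ranges; `overlaps_with` and `fully_contains` here are local stand-ins for the compaction helpers used in the diff:

```rust
fn overlaps_with(a: &std::ops::Range<u64>, b: &std::ops::Range<u64>) -> bool {
    a.start < b.end && b.start < a.end
}

fn fully_contains(outer: &std::ops::Range<u64>, inner: &std::ops::Range<u64>) -> bool {
    outer.start <= inner.start && inner.end <= outer.end
}

/// An image layer is kept (not removed from the layer map) when the compaction
/// key range covers it only partially.
fn keep_image_layer(image_key_range: &std::ops::Range<u64>, compaction_key_range: &std::ops::Range<u64>) -> bool {
    // Selection invariant from the diff: selected image layers must at least overlap.
    assert!(overlaps_with(image_key_range, compaction_key_range));
    !fully_contains(compaction_key_range, image_key_range)
}
```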
|
||||
let mut layer_selection = layer_selection;
|
||||
layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
|
||||
|
||||
info!(
|
||||
@@ -2273,7 +2086,6 @@ impl Timeline {
|
||||
|
||||
// Step 3: Place back to the layer map.
|
||||
{
|
||||
// TODO: sanity check if the layer map is valid (i.e., should not have overlaps)
|
||||
let mut guard = self.layers.write().await;
|
||||
guard
|
||||
.open_mut()?
|
||||
@@ -2336,7 +2148,7 @@ struct ResidentDeltaLayer(ResidentLayer);
|
||||
struct ResidentImageLayer(ResidentLayer);
|
||||
|
||||
impl CompactionJobExecutor for TimelineAdaptor {
|
||||
type Key = pageserver_api::key::Key;
|
||||
type Key = crate::repository::Key;
|
||||
|
||||
type Layer = OwnArc<PersistentLayerDesc>;
|
||||
type DeltaLayer = ResidentDeltaLayer;
|
||||
|
||||
@@ -6,7 +6,7 @@ use std::{
|
||||
use anyhow::Context;
|
||||
use pageserver_api::{models::TimelineState, shard::TenantShardId};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tracing::{error, info, info_span, instrument, Instrument};
|
||||
use tracing::{error, info, instrument, Instrument};
|
||||
use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint};
|
||||
|
||||
use crate::{
|
||||
@@ -14,11 +14,11 @@ use crate::{
|
||||
task_mgr::{self, TaskKind},
|
||||
tenant::{
|
||||
metadata::TimelineMetadata,
|
||||
remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient},
|
||||
CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant,
|
||||
TimelineOrOffloaded,
|
||||
remote_timeline_client::{
|
||||
self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
|
||||
},
|
||||
CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded,
|
||||
},
|
||||
virtual_file::MaybeFatalIo,
|
||||
};
|
||||
|
||||
use super::{Timeline, TimelineResources};
|
||||
@@ -63,10 +63,10 @@ pub(super) async fn delete_local_timeline_directory(
|
||||
conf: &PageServerConf,
|
||||
tenant_shard_id: TenantShardId,
|
||||
timeline: &Timeline,
|
||||
) {
|
||||
) -> anyhow::Result<()> {
|
||||
// Always ensure the lock order is compaction -> gc.
|
||||
let compaction_lock = timeline.compaction_lock.lock();
|
||||
let _compaction_lock = crate::timed(
|
||||
let compaction_lock = crate::timed(
|
||||
compaction_lock,
|
||||
"acquires compaction lock",
|
||||
std::time::Duration::from_secs(5),
|
||||
@@ -74,7 +74,7 @@ pub(super) async fn delete_local_timeline_directory(
|
||||
.await;
|
||||
|
||||
let gc_lock = timeline.gc_lock.lock();
|
||||
let _gc_lock = crate::timed(
|
||||
let gc_lock = crate::timed(
|
||||
gc_lock,
|
||||
"acquires gc lock",
|
||||
std::time::Duration::from_secs(5),
|
||||
@@ -86,15 +86,24 @@ pub(super) async fn delete_local_timeline_directory(
|
||||
|
||||
let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);
|
||||
|
||||
fail::fail_point!("timeline-delete-before-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
|
||||
});
|
||||
|
||||
// NB: This need not be atomic because the deleted flag in the IndexPart
|
||||
// will be observed during tenant/timeline load. The deletion will be resumed there.
|
||||
//
|
||||
// ErrorKind::NotFound can happen e.g. if we race with tenant detach, because,
|
||||
// Note that here we do not bail out on std::io::ErrorKind::NotFound.
|
||||
// This can happen if we're called a second time, e.g.,
|
||||
// because of a previous failure/cancellation at/after
|
||||
// failpoint timeline-delete-after-rm.
|
||||
//
|
||||
// ErrorKind::NotFound can also happen if we race with tenant detach, because,
|
||||
// no locks are shared.
|
||||
tokio::fs::remove_dir_all(local_timeline_directory)
|
||||
.await
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.fatal_err("removing timeline directory");
|
||||
.context("remove local timeline directory")?;
|
||||
|
||||
// Make sure previous deletions are ordered before mark removal.
|
||||
// Otherwise there is no guarantee that they reach the disk before mark deletion.
|
||||
@@ -105,9 +114,17 @@ pub(super) async fn delete_local_timeline_directory(
|
||||
let timeline_path = conf.timelines_path(&tenant_shard_id);
|
||||
crashsafe::fsync_async(timeline_path)
|
||||
.await
|
||||
.fatal_err("fsync after removing timeline directory");
|
||||
.context("fsync_pre_mark_remove")?;
|
||||
|
||||
info!("finished deleting layer files, releasing locks");
|
||||
drop(gc_lock);
|
||||
drop(compaction_lock);
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
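The function above leans on two properties spelled out in its comments: directory removal must be idempotent (NotFound is ignored because the function can run a second time or race with detach), and the removal must be made durable by fsyncing the parent directory before any deletion-mark work continues. A small self-contained sketch of that pattern using only std; it is an illustration of the idea, not the pageserver's actual I/O layer:

```rust
use std::io;
use std::path::Path;

/// Remove a directory tree, treating "already gone" as success.
fn remove_dir_all_idempotent(path: &Path) -> io::Result<()> {
    match std::fs::remove_dir_all(path) {
        Err(e) if e.kind() == io::ErrorKind::NotFound => Ok(()),
        other => other,
    }
}

/// Make the removal durable by fsyncing the parent directory (the usual Unix convention).
fn fsync_dir(parent: &Path) -> io::Result<()> {
    std::fs::File::open(parent)?.sync_all()
}

fn delete_timeline_dir(timeline_dir: &Path, timelines_dir: &Path) -> io::Result<()> {
    remove_dir_all_idempotent(timeline_dir)?;
    fsync_dir(timelines_dir)
}
```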
|
||||
/// Removes remote layers and an index file after them.
|
||||
@@ -159,6 +176,32 @@ async fn remove_maybe_offloaded_timeline_from_tenant(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// It is important that this gets called when DeletionGuard is being held.
|
||||
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
|
||||
async fn upload_new_tenant_manifest(
|
||||
tenant: &Tenant,
|
||||
_: &DeletionGuard, // using it as a witness
|
||||
) -> anyhow::Result<()> {
|
||||
// This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
|
||||
// between the deletion of the index-part.json and reaching of this code.
|
||||
// So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
|
||||
// However, we handle this case in tenant loading code so the next time we attach, the issue is
|
||||
// resolved.
|
||||
let manifest = tenant.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
remote_timeline_client::upload_tenant_manifest(
|
||||
&tenant.remote_storage,
|
||||
&tenant.tenant_shard_id,
|
||||
generation,
|
||||
&manifest,
|
||||
&tenant.cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
@@ -198,8 +241,7 @@ impl DeleteTimelineFlow {
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
super::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
|
||||
let allow_offloaded_children = false;
|
||||
let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?;
|
||||
let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;
|
||||
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
@@ -216,32 +258,7 @@ impl DeleteTimelineFlow {
|
||||
))?
|
||||
});
|
||||
|
||||
let remote_client = match timeline.maybe_remote_client() {
|
||||
Some(remote_client) => remote_client,
|
||||
None => {
|
||||
let remote_client = tenant
|
||||
.build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone());
|
||||
let result = remote_client
|
||||
.download_index_file(&tenant.cancel)
|
||||
.instrument(info_span!("download_index_file"))
|
||||
.await
|
||||
.map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?;
|
||||
let index_part = match result {
|
||||
MaybeDeletedIndexPart::Deleted(p) => {
|
||||
tracing::info!("Timeline already set as deleted in remote index");
|
||||
p
|
||||
}
|
||||
MaybeDeletedIndexPart::IndexPart(p) => p,
|
||||
};
|
||||
let remote_client = Arc::new(remote_client);
|
||||
|
||||
remote_client
|
||||
.init_upload_queue(&index_part)
|
||||
.map_err(DeleteTimelineError::Other)?;
|
||||
remote_client.shutdown().await;
|
||||
remote_client
|
||||
}
|
||||
};
|
||||
let remote_client = timeline.remote_client_maybe_construct(tenant);
|
||||
set_deleted_in_remote_index(&remote_client).await?;
|
||||
|
||||
fail::fail_point!("timeline-delete-before-schedule", |_| {
|
||||
@@ -325,7 +342,6 @@ impl DeleteTimelineFlow {
|
||||
pub(super) fn prepare(
|
||||
tenant: &Tenant,
|
||||
timeline_id: TimelineId,
|
||||
allow_offloaded_children: bool,
|
||||
) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
|
||||
// Note the interaction between this guard and deletion guard.
|
||||
// Here we attempt to lock deletion guard when we're holding a lock on timelines.
|
||||
@@ -338,27 +354,30 @@ impl DeleteTimelineFlow {
|
||||
// T1: acquire deletion lock, do another `DeleteTimelineFlow::run`
|
||||
// For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346`
|
||||
let timelines = tenant.timelines.lock().unwrap();
|
||||
let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
|
||||
|
||||
let timeline = match timelines.get(&timeline_id) {
|
||||
Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)),
|
||||
None => match timelines_offloaded.get(&timeline_id) {
|
||||
Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
|
||||
None => return Err(DeleteTimelineError::NotFound),
|
||||
},
|
||||
None => {
|
||||
let offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
|
||||
match offloaded_timelines.get(&timeline_id) {
|
||||
Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)),
|
||||
None => return Err(DeleteTimelineError::NotFound),
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Ensure that there are no child timelines, because we are about to remove files,
|
||||
// which will break child branches
|
||||
let mut children = Vec::new();
|
||||
if !allow_offloaded_children {
|
||||
children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| {
|
||||
(entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id)
|
||||
}));
|
||||
}
|
||||
children.extend(timelines.iter().filter_map(|(id, entry)| {
|
||||
(entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id)
|
||||
}));
|
||||
// Ensure that there are no child timelines **attached to that pageserver**,
|
||||
// because detach removes files, which will break child branches
|
||||
let children: Vec<TimelineId> = timelines
|
||||
.iter()
|
||||
.filter_map(|(id, entry)| {
|
||||
if entry.get_ancestor_timeline_id() == Some(timeline_id) {
|
||||
Some(*id)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
if !children.is_empty() {
|
||||
return Err(DeleteTimelineError::HasChildren(children));
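The preparation step above refuses to proceed while any timeline (and, when offloaded children are disallowed, any offloaded timeline) still lists the target as its ancestor. A standalone sketch of that child scan over a plain map; the types are stand-ins for the tenant's timeline maps:

```rust
use std::collections::HashMap;

/// Ids of entries whose ancestor is `target`; deletion must be refused if any exist.
fn child_ids(ancestors_by_id: &HashMap<u64, Option<u64>>, target: u64) -> Vec<u64> {
    ancestors_by_id
        .iter()
        .filter_map(|(id, ancestor)| (*ancestor == Some(target)).then_some(*id))
        .collect()
}
```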
@@ -424,35 +443,19 @@ impl DeleteTimelineFlow {
|
||||
timeline: &TimelineOrOffloaded,
|
||||
remote_client: Arc<RemoteTimelineClient>,
|
||||
) -> Result<(), DeleteTimelineError> {
|
||||
fail::fail_point!("timeline-delete-before-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
|
||||
});
|
||||
|
||||
// Offloaded timelines have no local state
|
||||
// TODO: once we persist offloaded information, delete the timeline from there, too
|
||||
if let TimelineOrOffloaded::Timeline(timeline) = timeline {
|
||||
delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await;
|
||||
delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?;
|
||||
}
|
||||
|
||||
fail::fail_point!("timeline-delete-after-rm", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
|
||||
});
|
||||
|
||||
delete_remote_layers_and_index(&remote_client).await?;
|
||||
|
||||
pausable_failpoint!("in_progress_delete");
|
||||
|
||||
remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?;
|
||||
|
||||
// This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash
|
||||
// between the deletion of the index-part.json and reaching of this code.
|
||||
// So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
|
||||
// However, we handle this case in tenant loading code so the next time we attach, the issue is
|
||||
// resolved.
|
||||
tenant
|
||||
.store_tenant_manifest()
|
||||
.await
|
||||
.map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?;
|
||||
upload_new_tenant_manifest(tenant, &guard).await?;
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ use crate::{
|
||||
virtual_file::{MaybeFatalIo, VirtualFile},
|
||||
};
|
||||
use anyhow::Context;
|
||||
use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity};
|
||||
use pageserver_api::models::detach_ancestor::AncestorDetached;
|
||||
use tokio::sync::Semaphore;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::Instrument;
|
||||
@@ -29,9 +29,6 @@ pub(crate) enum Error {
|
||||
#[error("shutting down, please retry later")]
|
||||
ShuttingDown,
|
||||
|
||||
#[error("archived: {}", .0)]
|
||||
Archived(TimelineId),
|
||||
|
||||
#[error(transparent)]
|
||||
NotFound(crate::tenant::GetTimelineError),
|
||||
|
||||
@@ -82,9 +79,8 @@ impl From<Error> for ApiError {
|
||||
fn from(value: Error) -> Self {
|
||||
match value {
|
||||
Error::NoAncestor => ApiError::Conflict(value.to_string()),
|
||||
Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{value}")),
|
||||
Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", value)),
|
||||
Error::ShuttingDown => ApiError::ShuttingDown,
|
||||
Error::Archived(_) => ApiError::BadRequest(anyhow::anyhow!("{value}")),
|
||||
Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => {
|
||||
ApiError::ResourceUnavailable(value.to_string().into())
|
||||
}
|
||||
@@ -205,18 +201,12 @@ pub(super) async fn prepare(
|
||||
}));
|
||||
};
|
||||
|
||||
if detached.is_archived() != Some(false) {
|
||||
return Err(Archived(detached.timeline_id));
|
||||
}
|
||||
|
||||
if !ancestor_lsn.is_valid() {
|
||||
// rare case, probably wouldn't even load
|
||||
tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing");
|
||||
return Err(NoAncestor);
|
||||
}
|
||||
|
||||
check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
|
||||
|
||||
if ancestor.ancestor_timeline.is_some() {
|
||||
// non-technical requirement; we could flatten N ancestors just as easily but we chose
|
||||
// not to, at least initially
|
||||
@@ -376,14 +366,8 @@ pub(super) async fn prepare(
|
||||
tasks.spawn(
|
||||
async move {
|
||||
let _permit = limiter.acquire().await;
|
||||
let owned = remote_copy(
|
||||
&adopted,
|
||||
&timeline,
|
||||
timeline.generation,
|
||||
timeline.shard_identity,
|
||||
&timeline.cancel,
|
||||
)
|
||||
.await?;
|
||||
let owned =
|
||||
remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?;
|
||||
tracing::info!(layer=%owned, "remote copied");
|
||||
Ok(owned)
|
||||
}
|
||||
@@ -635,7 +619,6 @@ async fn remote_copy(
|
||||
adopted: &Layer,
|
||||
adoptee: &Arc<Timeline>,
|
||||
generation: Generation,
|
||||
shard_identity: ShardIdentity,
|
||||
cancel: &CancellationToken,
|
||||
) -> Result<Layer, Error> {
|
||||
// depending on whether Layer::keep_resident holds, we could hardlink
|
||||
@@ -643,7 +626,6 @@ async fn remote_copy(
|
||||
let mut metadata = adopted.metadata();
|
||||
debug_assert!(metadata.generation <= generation);
|
||||
metadata.generation = generation;
|
||||
metadata.shard = shard_identity.shard_index();
|
||||
|
||||
let owned = crate::tenant::storage_layer::Layer::for_evicted(
|
||||
adoptee.conf,
|
||||
@@ -968,36 +950,3 @@ where
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn check_no_archived_children_of_ancestor(
|
||||
tenant: &Tenant,
|
||||
detached: &Arc<Timeline>,
|
||||
ancestor: &Arc<Timeline>,
|
||||
ancestor_lsn: Lsn,
|
||||
) -> Result<(), Error> {
|
||||
let timelines = tenant.timelines.lock().unwrap();
|
||||
let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
|
||||
for timeline in reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) {
|
||||
if timeline.is_archived() == Some(true) {
|
||||
return Err(Error::Archived(timeline.timeline_id));
|
||||
}
|
||||
}
|
||||
for timeline_offloaded in timelines_offloaded.values() {
|
||||
if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) {
|
||||
continue;
|
||||
}
|
||||
// This forbids the detach ancestor feature if flattened timelines are present,
|
||||
// even if the ancestor_lsn is from after the branchpoint of the detached timeline.
|
||||
// But as per current design, we don't record the ancestor_lsn of flattened timelines.
|
||||
// This is a bit unfortunate, but as of writing this we don't support flattening
|
||||
// anyway. Maybe we can evolve the data model in the future.
|
||||
if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn {
|
||||
let is_earlier = retain_lsn <= ancestor_lsn;
|
||||
if !is_earlier {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return Err(Error::Archived(timeline_offloaded.timeline_id));
|
||||
}
|
||||
Ok(())
|
||||
}
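The loop over offloaded timelines above skips entries whose recorded retain LSN lies above the `ancestor_lsn` being detached, and treats everything else (including entries with no recorded LSN) as a blocking archived child. A self-contained restatement of that filter with a stand-in struct, not the real offloaded-timeline type:

```rust
struct OffloadedEntry {
    id: u64,
    ancestor_id: Option<u64>,
    ancestor_retain_lsn: Option<u64>,
}

/// First offloaded timeline that blocks detaching from `ancestor_id` at `ancestor_lsn`.
fn first_blocking(entries: &[OffloadedEntry], ancestor_id: u64, ancestor_lsn: u64) -> Option<u64> {
    entries.iter().find_map(|e| {
        if e.ancestor_id != Some(ancestor_id) {
            return None;
        }
        if let Some(retain_lsn) = e.ancestor_retain_lsn {
            if retain_lsn > ancestor_lsn {
                // Branched above the detach point; does not block the operation.
                return None;
            }
        }
        Some(e.id)
    })
}
```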
|
||||
@@ -3,40 +3,16 @@ use std::sync::Arc;
|
||||
use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
|
||||
use super::Timeline;
|
||||
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
|
||||
use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
|
||||
|
||||
#[derive(thiserror::Error, Debug)]
|
||||
pub(crate) enum OffloadError {
|
||||
#[error("Cancelled")]
|
||||
Cancelled,
|
||||
#[error("Timeline is not archived")]
|
||||
NotArchived,
|
||||
#[error(transparent)]
|
||||
RemoteStorage(anyhow::Error),
|
||||
#[error("Unexpected offload error: {0}")]
|
||||
Other(anyhow::Error),
|
||||
}
|
||||
|
||||
impl From<TenantManifestError> for OffloadError {
|
||||
fn from(e: TenantManifestError) -> Self {
|
||||
match e {
|
||||
TenantManifestError::Cancelled => Self::Cancelled,
|
||||
TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e),
|
||||
}
|
||||
}
|
||||
}
|
||||
use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded};
|
||||
|
||||
pub(crate) async fn offload_timeline(
|
||||
tenant: &Tenant,
|
||||
timeline: &Arc<Timeline>,
|
||||
) -> Result<(), OffloadError> {
|
||||
) -> anyhow::Result<()> {
|
||||
debug_assert_current_span_has_tenant_and_timeline_id();
|
||||
tracing::info!("offloading archived timeline");
|
||||
|
||||
let allow_offloaded_children = true;
|
||||
let (timeline, guard) =
|
||||
DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)
|
||||
.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
|
||||
let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?;
|
||||
|
||||
let TimelineOrOffloaded::Timeline(timeline) = timeline else {
|
||||
tracing::error!("timeline already offloaded, but given timeline object");
|
||||
@@ -47,26 +23,28 @@ pub(crate) async fn offload_timeline(
|
||||
match is_archived {
|
||||
Some(true) => (),
|
||||
Some(false) => {
|
||||
tracing::warn!("tried offloading a non-archived timeline");
|
||||
return Err(OffloadError::NotArchived);
|
||||
tracing::warn!(?is_archived, "tried offloading a non-archived timeline");
|
||||
anyhow::bail!("timeline isn't archived");
|
||||
}
|
||||
None => {
|
||||
// This is legal: calls to this function can race with the timeline shutting down
|
||||
tracing::info!("tried offloading a timeline whose remote storage is not initialized");
|
||||
return Err(OffloadError::Cancelled);
|
||||
tracing::warn!(
|
||||
?is_archived,
|
||||
"tried offloading a timeline where manifest is not yet available"
|
||||
);
|
||||
anyhow::bail!("timeline manifest hasn't been loaded yet");
|
||||
}
|
||||
}
|
||||
// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
|
||||
timeline.shutdown(super::ShutdownMode::Flush).await;
|
||||
timeline.shutdown(super::ShutdownMode::Hard).await;
|
||||
|
||||
// TODO extend guard mechanism above with method
|
||||
// to make deletions possible while offloading is in progress
|
||||
|
||||
let conf = &tenant.conf;
|
||||
delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await;
|
||||
delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?;
|
||||
|
||||
remove_timeline_from_tenant(tenant, &timeline, &guard);
|
||||
remove_timeline_from_tenant(tenant, &timeline, &guard).await?;
|
||||
|
||||
{
|
||||
let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap();
|
||||
@@ -85,18 +63,28 @@ pub(crate) async fn offload_timeline(
|
||||
// at the next restart attach it again.
|
||||
// For that to happen, we'd need to make the manifest reflect our *intended* state,
|
||||
// not our actual state of offloaded timelines.
|
||||
tenant.store_tenant_manifest().await?;
|
||||
let manifest = tenant.tenant_manifest();
|
||||
// TODO: generation support
|
||||
let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION;
|
||||
remote_timeline_client::upload_tenant_manifest(
|
||||
&tenant.remote_storage,
|
||||
&tenant.tenant_shard_id,
|
||||
generation,
|
||||
&manifest,
|
||||
&tenant.cancel,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// It is important that this gets called when DeletionGuard is being held.
|
||||
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
|
||||
fn remove_timeline_from_tenant(
|
||||
async fn remove_timeline_from_tenant(
|
||||
tenant: &Tenant,
|
||||
timeline: &Timeline,
|
||||
_: &DeletionGuard, // using it as a witness
|
||||
) {
|
||||
) -> anyhow::Result<()> {
|
||||
// Remove the timeline from the map.
|
||||
let mut timelines = tenant.timelines.lock().unwrap();
|
||||
let children_exist = timelines
|
||||
@@ -112,4 +100,8 @@ fn remove_timeline_from_tenant(
|
||||
timelines
|
||||
.remove(&timeline.timeline_id)
|
||||
.expect("timeline that we were deleting was concurrently removed from 'timelines' map");
|
||||
|
||||
drop(timelines);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -141,9 +141,7 @@ impl Drop for UninitializedTimeline<'_> {
|
||||
fn drop(&mut self) {
|
||||
if let Some((_, create_guard)) = self.raw_timeline.take() {
|
||||
let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
|
||||
// This is unusual, but can happen harmlessly if the pageserver is stopped while
|
||||
// creating a timeline.
|
||||
info!("Timeline got dropped without initializing, cleaning its files");
|
||||
error!("Timeline got dropped without initializing, cleaning its files");
|
||||
cleanup_timeline_directory(create_guard);
|
||||
}
|
||||
}