Merge pull request #8505 from neondatabase/rc/proxy/2024-07-25

Proxy release 2024-07-25
This commit is contained in:
Conrad Ludgate
2024-07-25 11:07:19 +01:00
committed by GitHub
215 changed files with 8582 additions and 4282 deletions

2
.gitattributes vendored Normal file
View File

@@ -0,0 +1,2 @@
# allows for nicer hunk headers with git show
*.rs diff=rust

View File

@@ -9,8 +9,8 @@ inputs:
description: 'Region ID, if not set the project will be created in the default region'
default: aws-us-east-2
postgres_version:
description: 'Postgres version; default is 15'
default: '15'
description: 'Postgres version; default is 16'
default: '16'
api_host:
description: 'Neon API host'
default: console-stage.neon.build

View File

@@ -0,0 +1,291 @@
name: Build and Test Locally
on:
workflow_call:
inputs:
arch:
description: 'x64 or arm64'
required: true
type: string
build-tag:
description: 'build tag'
required: true
type: string
build-tools-image:
description: 'build-tools image'
required: true
type: string
build-type:
description: 'debug or release'
required: true
type: string
defaults:
run:
shell: bash -euxo pipefail {0}
env:
RUST_BACKTRACE: 1
COPT: '-Werror'
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
jobs:
build-neon:
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
container:
image: ${{ inputs.build-tools-image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
# Raise locked memory limit for tokio-epoll-uring.
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
# io_uring will account the memory of the CQ and SQ as locked.
# More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
env:
BUILD_TYPE: ${{ inputs.build-type }}
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG: ${{ inputs.build-tag }}
steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Set pg 16 revision for caching
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
# Set some environment variables used by all the steps.
#
# CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
# It also includes --features, if any
#
# CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
# because "cargo metadata" doesn't accept --release or --debug options
#
# We run tests with addtional features, that are turned off by default (e.g. in release builds), see
# corresponding Cargo.toml files for their descriptions.
- name: Set env variables
run: |
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FLAGS="--locked"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FLAGS="--locked --release"
fi
{
echo "cov_prefix=${cov_prefix}"
echo "CARGO_FEATURES=${CARGO_FEATURES}"
echo "CARGO_FLAGS=${CARGO_FLAGS}"
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
} >> $GITHUB_ENV
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: mold -run make postgres-v14 -j$(nproc)
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: mold -run make postgres-v15 -j$(nproc)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: mold -run make postgres-v16 -j$(nproc)
- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
- name: Build walproposer-lib
run: mold -run make walproposer-lib -j$(nproc)
- name: Run cargo build
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
# Do install *before* running rust tests because they might recompile the
# binaries with different features/flags.
- name: Install rust binaries
run: |
# Install target binaries
mkdir -p /tmp/neon/bin/
binaries=$(
${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
)
for bin in $binaries; do
SRC=target/$BUILD_TYPE/$bin
DST=/tmp/neon/bin/$bin
cp "$SRC" "$DST"
done
# Install test executables and write list of all binaries (for code coverage)
if [[ $BUILD_TYPE == "debug" ]]; then
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
mkdir -p /tmp/neon/test_bin/
test_exe_paths=$(
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
for bin in $test_exe_paths; do
SRC=$bin
DST=/tmp/neon/test_bin/$(basename $bin)
# We don't need debug symbols for code coverage, so strip them out to make
# the artifact smaller.
strip "$SRC" -o "$DST"
echo "$DST" >> /tmp/coverage/binaries.list
done
for bin in $binaries; do
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
done
fi
- name: Run rust tests
env:
NEXTEST_RETRIES: 3
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
export LD_LIBRARY_PATH
#nextest does not yet support running doctests
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
done
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
- name: Install postgres binaries
run: cp -a pg_install /tmp/neon/pg_install
- name: Upload Neon artifact
uses: ./.github/actions/upload
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
path: /tmp/neon
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
- name: Merge and upload coverage data
if: inputs.build-type == 'debug'
uses: ./.github/actions/save-coverage-data
regress-tests:
# Run test on x64 only
if: inputs.arch == 'x64'
needs: [ build-neon ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
container:
image: ${{ inputs.build-tools-image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
fail-fast: false
matrix:
pg_version: [ v14, v15, v16 ]
steps:
- uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Pytest regression tests
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
with:
build_type: ${{ inputs.build-type }}
test_selection: regress
needs_postgres_source: true
run_with_real_s3: true
real_s3_bucket: neon-github-ci-tests
real_s3_region: eu-central-1
rerun_flaky: true
pg_version: ${{ matrix.pg_version }}
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ inputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: true
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
- name: Merge and upload coverage data
if: |
false &&
inputs.build-type == 'debug' && matrix.pg_version == 'v14'
uses: ./.github/actions/save-coverage-data

View File

@@ -56,15 +56,27 @@ concurrency:
jobs:
bench:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
strategy:
fail-fast: false
matrix:
include:
- DEFAULT_PG_VERSION: 16
PLATFORM: "neon-staging"
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
provisioner: 'k8s-pod'
- DEFAULT_PG_VERSION: 16
PLATFORM: "azure-staging"
region_id: 'azure-eastus2'
provisioner: 'k8s-neonvm'
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "300"
TEST_PG_BENCH_SCALES_MATRIX: "10,100"
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
DEFAULT_PG_VERSION: ${{ matrix.DEFAULT_PG_VERSION }}
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-staging"
PLATFORM: ${{ matrix.PLATFORM }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
@@ -85,9 +97,10 @@ jobs:
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
region_id: ${{ matrix.region_id }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
provisioner: ${{ matrix.provisioner }}
- name: Run benchmark
uses: ./.github/actions/run-python-test-set
@@ -96,13 +109,14 @@ jobs:
test_selection: performance
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
pg_version: ${{ env.DEFAULT_PG_VERSION }}
# Set --sparse-ordering option of pytest-order plugin
# to ensure tests are running in order of appears in the file.
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests
extra_params:
-m remote_cluster
--sparse-ordering
--timeout 5400
--timeout 14400
--ignore test_runner/performance/test_perf_olap.py
--ignore test_runner/performance/test_perf_pgvector_queries.py
--ignore test_runner/performance/test_logical_replication.py
@@ -133,6 +147,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
replication-tests:
if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
@@ -177,6 +192,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 5400
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -202,11 +218,14 @@ jobs:
# Available platforms:
# - neon-captest-new: Freshly created project (1 CU)
# - neon-captest-freetier: Use freetier-sized compute (0.25 CU)
# - neonvm-captest-azure-new: Freshly created project (1 CU) in azure region
# - neonvm-captest-azure-freetier: Use freetier-sized compute (0.25 CU) in azure region
# - neon-captest-reuse: Reusing existing project
# - rds-aurora: Aurora Postgres Serverless v2 with autoscaling from 0.5 to 2 ACUs
# - rds-postgres: RDS Postgres db.m5.large instance (2 vCPU, 8 GiB) with gp3 EBS storage
env:
RUN_AWS_RDS_AND_AURORA: ${{ github.event.inputs.run_AWS_RDS_AND_AURORA || 'false' }}
DEFAULT_REGION_ID: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
runs-on: ubuntu-22.04
outputs:
pgbench-compare-matrix: ${{ steps.pgbench-compare-matrix.outputs.matrix }}
@@ -217,23 +236,33 @@ jobs:
- name: Generate matrix for pgbench benchmark
id: pgbench-compare-matrix
run: |
region_id_default=${{ env.DEFAULT_REGION_ID }}
matrix='{
"pg_version" : [
16
],
"region_id" : [
"'"$region_id_default"'"
],
"platform": [
"neon-captest-new",
"neon-captest-reuse",
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
"include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neon-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb" },
{ "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb" },
{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]')
matrix=$(echo "$matrix" | jq '.include += [{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-postgres", "db_size": "10gb"},
{ "pg_version": 14, "region_id": "'"$region_id_default"'", "platform": "rds-aurora", "db_size": "50gb"}]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -285,7 +314,7 @@ jobs:
TEST_PG_BENCH_DURATIONS_MATRIX: "60m"
TEST_PG_BENCH_SCALES_MATRIX: ${{ matrix.db_size }}
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
DEFAULT_PG_VERSION: 14
DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
@@ -310,14 +339,14 @@ jobs:
prefix: latest
- name: Create Neon Project
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform)
if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier", "neonvm-azure-captest-freetier", "neonvm-azure-captest-new"]'), matrix.platform)
id: create-neon-project
uses: ./.github/actions/neon-project-create
with:
region_id: ${{ github.event.inputs.region_id || 'aws-us-east-2' }}
region_id: ${{ matrix.region_id }}
postgres_version: ${{ env.DEFAULT_PG_VERSION }}
api_key: ${{ secrets.NEON_STAGING_API_KEY }}
compute_units: ${{ (matrix.platform == 'neon-captest-freetier' && '[0.25, 0.25]') || '[1, 1]' }}
compute_units: ${{ (contains(matrix.platform, 'captest-freetier') && '[0.25, 0.25]') || '[1, 1]' }}
provisioner: ${{ (contains(matrix.platform, 'neonvm-') && 'k8s-neonvm') || 'k8s-pod' }}
- name: Set up Connection String
@@ -330,7 +359,7 @@ jobs:
neonvm-captest-sharding-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
;;
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier | neonvm-azure-captest-new | neonvm-azure-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
rds-aurora)
@@ -355,6 +384,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_init
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -368,6 +398,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_simple_update
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -381,6 +412,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgbench_remote_select_only
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -407,6 +439,13 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
pgbench-pgvector:
strategy:
fail-fast: false
matrix:
include:
- PLATFORM: "neon-captest-pgvector"
- PLATFORM: "azure-captest-pgvector"
env:
TEST_PG_BENCH_DURATIONS_MATRIX: "15m"
TEST_PG_BENCH_SCALES_MATRIX: "1"
@@ -414,8 +453,9 @@ jobs:
DEFAULT_PG_VERSION: 16
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: remote
LD_LIBRARY_PATH: /home/nonroot/pg/usr/lib/x86_64-linux-gnu
SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
PLATFORM: "neon-captest-pgvector"
PLATFORM: ${{ matrix.PLATFORM }}
runs-on: [ self-hosted, us-east-2, x64 ]
container:
@@ -425,17 +465,39 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
# until https://github.com/neondatabase/neon/issues/8275 is fixed we temporarily install postgresql-16
# instead of using Neon artifacts containing pgbench
- name: Install postgresql-16 where pytest expects it
run: |
cd /home/nonroot
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/libpq5_16.3-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-client-16_16.3-1.pgdg110%2B1_amd64.deb
wget -q https://apt.postgresql.org/pub/repos/apt/pool/main/p/postgresql-16/postgresql-16_16.3-1.pgdg110%2B1_amd64.deb
dpkg -x libpq5_16.3-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-client-16_16.3-1.pgdg110+1_amd64.deb pg
dpkg -x postgresql-16_16.3-1.pgdg110+1_amd64.deb pg
mkdir -p /tmp/neon/pg_install/v16/bin
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/pgbench /tmp/neon/pg_install/v16/bin/pgbench
ln -s /home/nonroot/pg/usr/lib/postgresql/16/bin/psql /tmp/neon/pg_install/v16/bin/psql
ln -s /home/nonroot/pg/usr/lib/x86_64-linux-gnu /tmp/neon/pg_install/v16/lib
/tmp/neon/pg_install/v16/bin/pgbench --version
/tmp/neon/pg_install/v16/bin/psql --version
- name: Set up Connection String
id: set-up-connstr
run: |
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
case "${PLATFORM}" in
neon-captest-pgvector)
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }}
;;
azure-captest-pgvector)
CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR_AZURE }}
;;
*)
echo >&2 "Unknown PLATFORM=${PLATFORM}"
exit 1
;;
esac
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
@@ -447,6 +509,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_pgvector_indexing
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
@@ -460,6 +523,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -474,11 +538,10 @@ jobs:
uses: slackapi/slack-github-action@v1
with:
channel-id: "C033QLM5P7D" # dev-staging-stream
slack-message: "Periodic perf testing neon-captest-pgvector: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
slack-message: "Periodic perf testing ${PLATFORM}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
env:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
clickbench-compare:
# ClichBench DB for rds-aurora and rds-Postgres deployed to the same clusters
# we use for performance testing in pgbench-compare.
@@ -722,6 +785,7 @@ jobs:
run_in_parallel: false
save_perf_report: ${{ env.SAVE_PERF_REPORT }}
extra_params: -m remote_cluster --timeout 21600 -k test_user_examples
pg_version: ${{ env.DEFAULT_PG_VERSION }}
env:
VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"

View File

@@ -125,7 +125,11 @@ jobs:
check-codestyle-rust:
needs: [ check-permissions, build-build-tools-image ]
runs-on: [ self-hosted, gen3, small ]
strategy:
matrix:
arch: [ x64, arm64 ]
runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
@@ -193,291 +197,27 @@ jobs:
if: ${{ !cancelled() }}
run: cargo deny check --hide-inclusion-graph
build-neon:
needs: [ check-permissions, tag, build-build-tools-image ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
# Raise locked memory limit for tokio-epoll-uring.
# On 5.10 LTS kernels < 5.10.162 (and generally mainline kernels < 5.12),
# io_uring will account the memory of the CQ and SQ as locked.
# More details: https://github.com/neondatabase/neon/issues/6373#issuecomment-1905814391
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
build-and-test-locally:
needs: [ tag, build-build-tools-image ]
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
env:
BUILD_TYPE: ${{ matrix.build_type }}
GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Set pg 16 revision for caching
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
# Set some environment variables used by all the steps.
#
# CARGO_FLAGS is extra options to pass to "cargo build", "cargo test" etc.
# It also includes --features, if any
#
# CARGO_FEATURES is passed to "cargo metadata". It is separate from CARGO_FLAGS,
# because "cargo metadata" doesn't accept --release or --debug options
#
# We run tests with addtional features, that are turned off by default (e.g. in release builds), see
# corresponding Cargo.toml files for their descriptions.
- name: Set env variables
run: |
CARGO_FEATURES="--features testing"
if [[ $BUILD_TYPE == "debug" ]]; then
cov_prefix="scripts/coverage --profraw-prefix=$GITHUB_JOB --dir=/tmp/coverage run"
CARGO_FLAGS="--locked"
elif [[ $BUILD_TYPE == "release" ]]; then
cov_prefix=""
CARGO_FLAGS="--locked --release"
fi
{
echo "cov_prefix=${cov_prefix}"
echo "CARGO_FEATURES=${CARGO_FEATURES}"
echo "CARGO_FLAGS=${CARGO_FLAGS}"
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo"
} >> $GITHUB_ENV
# Disabled for now
# Don't include the ~/.cargo/registry/src directory. It contains just
# uncompressed versions of the crates in ~/.cargo/registry/cache
# directory, and it's faster to let 'cargo' to rebuild it from the
# compressed crates.
# - name: Cache cargo deps
# id: cache_cargo
# uses: actions/cache@v4
# with:
# path: |
# ~/.cargo/registry/
# !~/.cargo/registry/src
# ~/.cargo/git/
# target/
# # Fall back to older versions of the key, if no cache for current Cargo.lock was found
# key: |
# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }}
# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }}
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: mold -run make postgres-v14 -j$(nproc)
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: mold -run make postgres-v15 -j$(nproc)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: mold -run make postgres-v16 -j$(nproc)
- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
- name: Build walproposer-lib
run: mold -run make walproposer-lib -j$(nproc)
- name: Run cargo build
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
${cov_prefix} mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
# Do install *before* running rust tests because they might recompile the
# binaries with different features/flags.
- name: Install rust binaries
run: |
# Install target binaries
mkdir -p /tmp/neon/bin/
binaries=$(
${cov_prefix} cargo metadata $CARGO_FEATURES --format-version=1 --no-deps |
jq -r '.packages[].targets[] | select(.kind | index("bin")) | .name'
)
for bin in $binaries; do
SRC=target/$BUILD_TYPE/$bin
DST=/tmp/neon/bin/$bin
cp "$SRC" "$DST"
done
# Install test executables and write list of all binaries (for code coverage)
if [[ $BUILD_TYPE == "debug" ]]; then
# Keep bloated coverage data files away from the rest of the artifact
mkdir -p /tmp/coverage/
mkdir -p /tmp/neon/test_bin/
test_exe_paths=$(
${cov_prefix} cargo test $CARGO_FLAGS $CARGO_FEATURES --message-format=json --no-run |
jq -r '.executable | select(. != null)'
)
for bin in $test_exe_paths; do
SRC=$bin
DST=/tmp/neon/test_bin/$(basename $bin)
# We don't need debug symbols for code coverage, so strip them out to make
# the artifact smaller.
strip "$SRC" -o "$DST"
echo "$DST" >> /tmp/coverage/binaries.list
done
for bin in $binaries; do
echo "/tmp/neon/bin/$bin" >> /tmp/coverage/binaries.list
done
fi
- name: Run rust tests
env:
NEXTEST_RETRIES: 3
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
export LD_LIBRARY_PATH
#nextest does not yet support running doctests
cargo test --doc $CARGO_FLAGS $CARGO_FEATURES
for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine ${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES
done
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_s3)'
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
${cov_prefix} cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(remote_storage)' -E 'test(test_real_azure)'
- name: Install postgres binaries
run: cp -a pg_install /tmp/neon/pg_install
- name: Upload Neon artifact
uses: ./.github/actions/upload
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact
path: /tmp/neon
# XXX: keep this after the binaries.list is formed, so the coverage can properly work later
- name: Merge and upload coverage data
if: matrix.build_type == 'debug'
uses: ./.github/actions/save-coverage-data
regress-tests:
needs: [ check-permissions, build-neon, build-build-tools-image, tag ]
runs-on: [ self-hosted, gen3, large ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
pg_version: [ v14, v15, v16 ]
steps:
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Pytest regression tests
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
with:
build_type: ${{ matrix.build_type }}
test_selection: regress
needs_postgres_source: true
run_with_real_s3: true
real_s3_bucket: neon-github-ci-tests
real_s3_region: eu-central-1
rerun_flaky: true
pg_version: ${{ matrix.pg_version }}
env:
TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_IMPL: vectored
PAGESERVER_GET_IMPL: vectored
PAGESERVER_VALIDATE_VEC_GET: true
# Temporary disable this step until we figure out why it's so flaky
# Ref https://github.com/neondatabase/neon/issues/4540
- name: Merge and upload coverage data
if: |
false &&
matrix.build_type == 'debug' && matrix.pg_version == 'v14'
uses: ./.github/actions/save-coverage-data
arch: [ x64 ]
build-type: [ debug, release ]
include:
- build-type: release
arch: arm64
uses: ./.github/workflows/_build-and-test-locally.yml
with:
arch: ${{ matrix.arch }}
build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}
build-tag: ${{ needs.tag.outputs.build-tag }}
build-type: ${{ matrix.build-type }}
secrets: inherit
# Keep `benchmarks` job outside of `build-and-test-locally` workflow to make job failures non-blocking
get-benchmarks-durations:
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
outputs:
json: ${{ steps.get-benchmark-durations.outputs.json }}
needs: [ check-permissions, build-build-tools-image ]
@@ -488,7 +228,6 @@ jobs:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -513,7 +252,8 @@ jobs:
echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT
benchmarks:
needs: [ check-permissions, build-neon, build-build-tools-image, get-benchmarks-durations ]
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -522,7 +262,6 @@ jobs:
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
# for changed limits, see comments on `options:` earlier in this file
options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
strategy:
fail-fast: false
matrix:
@@ -570,7 +309,7 @@ jobs:
SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
create-test-report:
needs: [ check-permissions, regress-tests, coverage-report, benchmarks, build-build-tools-image ]
needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image ]
if: ${{ !cancelled() && contains(fromJSON('["skipped", "success"]'), needs.check-permissions.result) }}
outputs:
report-url: ${{ steps.create-allure-report.outputs.report-url }}
@@ -621,7 +360,7 @@ jobs:
})
coverage-report:
needs: [ check-permissions, regress-tests, build-build-tools-image ]
needs: [ check-permissions, build-build-tools-image, build-and-test-locally ]
runs-on: [ self-hosted, gen3, small ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -772,7 +511,8 @@ jobs:
pull: true
file: Dockerfile
cache-from: type=registry,ref=neondatabase/neon:cache-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
# 23.07.2024 temporarily disable cache saving in the registry as it is very slow
# cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon:cache-{0},mode=max', matrix.arch) || '' }}
tags: |
neondatabase/neon:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
@@ -865,7 +605,8 @@ jobs:
pull: true
file: Dockerfile.compute-node
cache-from: type=registry,ref=neondatabase/compute-node-${{ matrix.version }}:cache-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
# 23.07.2024 temporarily disable cache saving in the registry as it is very slow
# cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/compute-node-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
tags: |
neondatabase/compute-node-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.arch }}
@@ -885,7 +626,8 @@ jobs:
file: Dockerfile.compute-node
target: neon-pg-ext-test
cache-from: type=registry,ref=neondatabase/neon-test-extensions-${{ matrix.version }}:cache-${{ matrix.arch }}
cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
# 23.07.2024 temporarily disable cache saving in the registry as it is very slow
# cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=neondatabase/neon-test-extensions-{0}:cache-{1},mode=max', matrix.version, matrix.arch) || '' }}
tags: |
neondatabase/neon-test-extensions-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}-${{ matrix.arch }}
@@ -1223,7 +965,7 @@ jobs:
exit 1
deploy:
needs: [ check-permissions, promote-images, tag, regress-tests, trigger-custom-extensions-build-and-wait ]
needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
if: github.ref_name == 'main' || github.ref_name == 'release'|| github.ref_name == 'release-proxy'
runs-on: [ self-hosted, gen3, small ]
@@ -1324,7 +1066,7 @@ jobs:
})
promote-compatibility-data:
needs: [ check-permissions, promote-images, tag, regress-tests ]
needs: [ check-permissions, promote-images, tag, build-and-test-locally ]
if: github.ref_name == 'release'
runs-on: [ self-hosted, gen3, small ]
@@ -1363,7 +1105,7 @@ jobs:
done
pin-build-tools-image:
needs: [ build-build-tools-image, promote-images, regress-tests ]
needs: [ build-build-tools-image, promote-images, build-and-test-locally ]
if: github.ref_name == 'main'
uses: ./.github/workflows/pin-build-tools-image.yml
with:
@@ -1385,7 +1127,7 @@ jobs:
needs:
- check-codestyle-python
- check-codestyle-rust
- regress-tests
- build-and-test-locally
- test-images
runs-on: ubuntu-22.04
steps:

View File

@@ -133,221 +133,6 @@ jobs:
- name: Check that no warnings are produced
run: ./run_clippy.sh
check-linux-arm-build:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
runs-on: [ self-hosted, small-arm64 ]
env:
# Use release build only, to have less debug info around
# Hence keeping target/ (and general cache size) smaller
BUILD_TYPE: release
CARGO_FEATURES: --features testing
CARGO_FLAGS: --release
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
- name: Set pg 14 revision for caching
id: pg_v14_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
- name: Set pg 15 revision for caching
id: pg_v15_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
- name: Set pg 16 revision for caching
id: pg_v16_rev
run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
- name: Set env variables
run: |
echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
- name: Cache postgres v14 build
id: cache_pg_14
uses: actions/cache@v4
with:
path: pg_install/v14
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v15 build
id: cache_pg_15
uses: actions/cache@v4
with:
path: pg_install/v15
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Cache postgres v16 build
id: cache_pg_16
uses: actions/cache@v4
with:
path: pg_install/v16
key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
- name: Build postgres v14
if: steps.cache_pg_14.outputs.cache-hit != 'true'
run: mold -run make postgres-v14 -j$(nproc)
- name: Build postgres v15
if: steps.cache_pg_15.outputs.cache-hit != 'true'
run: mold -run make postgres-v15 -j$(nproc)
- name: Build postgres v16
if: steps.cache_pg_16.outputs.cache-hit != 'true'
run: mold -run make postgres-v16 -j$(nproc)
- name: Build neon extensions
run: mold -run make neon-pg-ext -j$(nproc)
- name: Build walproposer-lib
run: mold -run make walproposer-lib -j$(nproc)
- name: Run cargo build
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc)
- name: Run cargo test
env:
NEXTEST_RETRIES: 3
run: |
PQ_LIB_DIR=$(pwd)/pg_install/v16/lib
export PQ_LIB_DIR
LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib
export LD_LIBRARY_PATH
cargo nextest run $CARGO_FEATURES -j$(nproc)
# Run separate tests for real S3
export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
export REMOTE_STORAGE_S3_BUCKET=neon-github-ci-tests
export REMOTE_STORAGE_S3_REGION=eu-central-1
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_s3 -j$(nproc)
# Run separate tests for real Azure Blob Storage
# XXX: replace region with `eu-central-1`-like region
export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
# Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
cargo nextest run --package remote_storage --test test_real_azure -j$(nproc)
check-codestyle-rust-arm:
needs: [ check-permissions, build-build-tools-image ]
timeout-minutes: 90
runs-on: [ self-hosted, small-arm64 ]
container:
image: ${{ needs.build-build-tools-image.outputs.image }}
credentials:
username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
options: --init
strategy:
fail-fast: false
matrix:
build_type: [ debug, release ]
steps:
- name: Fix git ownership
run: |
# Workaround for `fatal: detected dubious ownership in repository at ...`
#
# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
# Ref https://github.com/actions/checkout/issues/785
#
git config --global --add safe.directory ${{ github.workspace }}
git config --global --add safe.directory ${GITHUB_WORKSPACE}
for r in 14 15 16; do
git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
done
- name: Checkout
uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 1
# Some of our rust modules use FFI and need those to be checked
- name: Get postgres headers
run: make postgres-headers -j$(nproc)
# cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
# This will catch compiler & clippy warnings in all feature combinations.
# TODO: use cargo hack for build and test as well, but, that's quite expensive.
# NB: keep clippy args in sync with ./run_clippy.sh
- run: |
CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
echo "No clippy args found in .neon_clippy_args"
exit 1
fi
echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
- name: Run cargo clippy (debug)
if: matrix.build_type == 'debug'
run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
- name: Run cargo clippy (release)
if: matrix.build_type == 'release'
run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
- name: Check documentation generation
if: matrix.build_type == 'release'
run: cargo doc --workspace --no-deps --document-private-items -j$(nproc)
env:
RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
# Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
- name: Check formatting
if: ${{ !cancelled() && matrix.build_type == 'release' }}
run: cargo fmt --all -- --check
# https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
- name: Check rust dependencies
if: ${{ !cancelled() && matrix.build_type == 'release' }}
run: |
cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date
cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack
# https://github.com/EmbarkStudios/cargo-deny
- name: Check rust licenses/bans/advisories/sources
if: ${{ !cancelled() && matrix.build_type == 'release' }}
run: cargo deny check
gather-rust-build-stats:
needs: [ check-permissions, build-build-tools-image ]
if: |

171
Cargo.lock generated
View File

@@ -261,15 +261,6 @@ version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
[[package]]
name = "atomic-polyfill"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c314e70d181aa6053b26e3f7fbf86d1dfff84f816a6175b967666b3506ef7289"
dependencies = [
"critical-section",
]
[[package]]
name = "atomic-take"
version = "1.1.0"
@@ -1236,6 +1227,7 @@ dependencies = [
"regex",
"remote_storage",
"reqwest 0.12.4",
"rlimit",
"rust-ini",
"serde",
"serde_json",
@@ -1367,6 +1359,7 @@ dependencies = [
"tracing",
"url",
"utils",
"whoami",
"workspace_hack",
]
@@ -1449,12 +1442,6 @@ dependencies = [
"itertools",
]
[[package]]
name = "critical-section"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52"
[[package]]
name = "crossbeam-channel"
version = "0.5.8"
@@ -2027,16 +2014,6 @@ dependencies = [
"tokio-util",
]
[[package]]
name = "fs2"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213"
dependencies = [
"libc",
"winapi",
]
[[package]]
name = "fsevent-sys"
version = "4.1.0"
@@ -2290,15 +2267,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "hash32"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "47d60b12902ba28e2730cd37e95b8c9223af2808df9e902d4df49588d1470606"
dependencies = [
"byteorder",
]
[[package]]
name = "hashbrown"
version = "0.12.3"
@@ -2347,18 +2315,6 @@ dependencies = [
"num-traits",
]
[[package]]
name = "heapless"
version = "0.8.0"
source = "git+https://github.com/japaric/heapless.git?rev=644653bf3b831c6bb4963be2de24804acf5e5001#644653bf3b831c6bb4963be2de24804acf5e5001"
dependencies = [
"atomic-polyfill",
"hash32",
"rustc_version",
"spin 0.9.8",
"stable_deref_trait",
]
[[package]]
name = "heck"
version = "0.4.1"
@@ -2392,16 +2348,6 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"
[[package]]
name = "histogram"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e673d137229619d5c2c8903b6ed5852b43636c0017ff2e66b1aafb8ccf04b80b"
dependencies = [
"serde",
"thiserror",
]
[[package]]
name = "hmac"
version = "0.12.1"
@@ -3242,16 +3188,6 @@ dependencies = [
"winapi",
]
[[package]]
name = "nu-ansi-term"
version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
dependencies = [
"overload",
"winapi",
]
[[package]]
name = "num"
version = "0.4.1"
@@ -3547,12 +3483,6 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
[[package]]
name = "overload"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "p256"
version = "0.11.1"
@@ -4413,6 +4343,7 @@ dependencies = [
"tracing-opentelemetry",
"tracing-subscriber",
"tracing-utils",
"typed-json",
"url",
"urlencoding",
"utils",
@@ -4611,6 +4542,15 @@ dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "redox_syscall"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
dependencies = [
"bitflags 1.3.2",
]
[[package]]
name = "regex"
version = "1.10.2"
@@ -4672,6 +4612,7 @@ name = "remote_storage"
version = "0.1.0"
dependencies = [
"anyhow",
"async-stream",
"async-trait",
"aws-config",
"aws-credential-types",
@@ -4901,6 +4842,15 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "rlimit"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3560f70f30a0f16d11d01ed078a07740fe6b489667abc7c7b029155d9f21c3d8"
dependencies = [
"libc",
]
[[package]]
name = "routerify"
version = "3.0.0"
@@ -5169,7 +5119,6 @@ dependencies = [
"crc32c",
"desim",
"fail",
"fs2",
"futures",
"git-version",
"hex",
@@ -5196,6 +5145,8 @@ dependencies = [
"sha2",
"signal-hook",
"storage_broker",
"strum",
"strum_macros",
"thiserror",
"tokio",
"tokio-io-timeout",
@@ -5704,9 +5655,6 @@ name = "spin"
version = "0.9.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
dependencies = [
"lock_api",
]
[[package]]
name = "spki"
@@ -5728,12 +5676,6 @@ dependencies = [
"der 0.7.8",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "static_assertions"
version = "1.1.0"
@@ -5810,6 +5752,28 @@ dependencies = [
"workspace_hack",
]
[[package]]
name = "storage_controller_client"
version = "0.1.0"
dependencies = [
"anyhow",
"async-trait",
"bytes",
"futures",
"pageserver_api",
"pageserver_client",
"postgres",
"reqwest 0.12.4",
"serde",
"thiserror",
"tokio",
"tokio-postgres",
"tokio-stream",
"tokio-util",
"utils",
"workspace_hack",
]
[[package]]
name = "storage_scrubber"
version = "0.1.0"
@@ -5829,7 +5793,6 @@ dependencies = [
"futures",
"futures-util",
"hex",
"histogram",
"humantime",
"itertools",
"once_cell",
@@ -5844,6 +5807,7 @@ dependencies = [
"serde",
"serde_json",
"serde_with",
"storage_controller_client",
"thiserror",
"tokio",
"tokio-postgres",
@@ -5873,6 +5837,7 @@ dependencies = [
"reqwest 0.12.4",
"serde",
"serde_json",
"storage_controller_client",
"thiserror",
"tokio",
"tracing",
@@ -6500,17 +6465,6 @@ version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52"
[[package]]
name = "trace"
version = "0.1.0"
dependencies = [
"anyhow",
"clap",
"pageserver_api",
"utils",
"workspace_hack",
]
[[package]]
name = "tracing"
version = "0.1.37"
@@ -6610,7 +6564,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
dependencies = [
"matchers",
"nu-ansi-term",
"once_cell",
"regex",
"serde",
@@ -6675,6 +6628,16 @@ dependencies = [
"static_assertions",
]
[[package]]
name = "typed-json"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6024a8d0025400b3f6b189366e9aa92012cf9c4fe1cd2620848dd61425c49eed"
dependencies = [
"serde",
"serde_json",
]
[[package]]
name = "typenum"
version = "1.16.0"
@@ -6809,7 +6772,6 @@ dependencies = [
"criterion",
"fail",
"futures",
"heapless",
"hex",
"hex-literal",
"humantime",
@@ -6971,6 +6933,12 @@ version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "wasite"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b"
[[package]]
name = "wasm-bindgen"
version = "0.2.92"
@@ -7123,6 +7091,17 @@ dependencies = [
"once_cell",
]
[[package]]
name = "whoami"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9"
dependencies = [
"redox_syscall 0.4.1",
"wasite",
"web-sys",
]
[[package]]
name = "winapi"
version = "0.3.9"

View File

@@ -13,9 +13,9 @@ members = [
"safekeeper",
"storage_broker",
"storage_controller",
"storage_controller/client",
"storage_scrubber",
"workspace_hack",
"trace",
"libs/compute_api",
"libs/pageserver_api",
"libs/postgres_ffi",
@@ -84,7 +84,6 @@ enumset = "1.0.12"
fail = "0.5.0"
fallible-iterator = "0.2"
framed-websockets = { version = "0.1.0", git = "https://github.com/neondatabase/framed-websockets" }
fs2 = "0.4.3"
futures = "0.3"
futures-core = "0.3"
futures-util = "0.3"
@@ -184,14 +183,16 @@ tower-service = "0.3.2"
tracing = "0.1"
tracing-error = "0.2.0"
tracing-opentelemetry = "0.21.0"
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
twox-hash = { version = "1.6.3", default-features = false }
typed-json = "0.1"
url = "2.2"
urlencoding = "2.1"
uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
walkdir = "2.3.2"
rustls-native-certs = "0.7"
x509-parser = "0.15"
whoami = "1.5.1"
## TODO replace this with tracing
env_logger = "0.10"
@@ -203,9 +204,6 @@ postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git",
postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
## Other git libraries
heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
## Local libraries
compute_api = { version = "0.1", path = "./libs/compute_api/" }
consumption_metrics = { version = "0.1", path = "./libs/consumption_metrics/" }
@@ -221,6 +219,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" }
safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" }
desim = { version = "0.1", path = "./libs/desim" }
storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be heavy.
storage_controller_client = { path = "./storage_controller/client" }
tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" }
tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
utils = { version = "0.1", path = "./libs/utils/" }

View File

@@ -93,13 +93,14 @@ COPY --from=pg-build /home/nonroot/postgres_install.tar.gz /data/
# By default, pageserver uses `.neon/` working directory in WORKDIR, so create one and fill it with the dummy config.
# Now, when `docker run ... pageserver` is run, it can start without errors, yet will have some default dummy values.
RUN mkdir -p /data/.neon/ && chown -R neon:neon /data/.neon/ \
&& /usr/local/bin/pageserver -D /data/.neon/ --init \
-c "id=1234" \
-c "broker_endpoint='http://storage_broker:50051'" \
-c "pg_distrib_dir='/usr/local/'" \
-c "listen_pg_addr='0.0.0.0:6400'" \
-c "listen_http_addr='0.0.0.0:9898'"
RUN mkdir -p /data/.neon/ && \
echo "id=1234" > "/data/.neon/identity.toml" && \
echo "broker_endpoint='http://storage_broker:50051'\n" \
"pg_distrib_dir='/usr/local/'\n" \
"listen_pg_addr='0.0.0.0:6400'\n" \
"listen_http_addr='0.0.0.0:9898'\n" \
> /data/.neon/pageserver.toml && \
chown -R neon:neon /data/.neon
# When running a binary that links with libpq, default to using our most recent postgres version. Binaries
# that want a particular postgres version will select it explicitly: this is just a default.
@@ -110,3 +111,6 @@ VOLUME ["/data"]
USER neon
EXPOSE 6400
EXPOSE 9898
CMD /usr/local/bin/pageserver -D /data/.neon

View File

@@ -311,9 +311,12 @@ RUN wget https://github.com/iCyberon/pg_hashids/archive/refs/tags/v1.2.1.tar.gz
FROM build-deps AS rum-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY patches/rum.patch /rum.patch
RUN wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \
echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \
mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . && \
patch -p1 < /rum.patch && \
make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/rum.control

View File

@@ -69,6 +69,8 @@ CARGO_CMD_PREFIX += CARGO_TERM_PROGRESS_WHEN=never CI=1
# Set PQ_LIB_DIR to make sure `storage_controller` get linked with bundled libpq (through diesel)
CARGO_CMD_PREFIX += PQ_LIB_DIR=$(POSTGRES_INSTALL_DIR)/v16/lib
CACHEDIR_TAG_CONTENTS := "Signature: 8a477f597d28d172789f06886806bc55"
#
# Top level Makefile to build Neon and PostgreSQL
#
@@ -79,15 +81,24 @@ all: neon postgres neon-pg-ext
#
# The 'postgres_ffi' depends on the Postgres headers.
.PHONY: neon
neon: postgres-headers walproposer-lib
neon: postgres-headers walproposer-lib cargo-target-dir
+@echo "Compiling Neon"
$(CARGO_CMD_PREFIX) cargo build $(CARGO_BUILD_FLAGS)
.PHONY: cargo-target-dir
cargo-target-dir:
# https://github.com/rust-lang/cargo/issues/14281
mkdir -p target
test -e target/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > target/CACHEDIR.TAG
### PostgreSQL parts
# Some rules are duplicated for Postgres v14 and 15. We may want to refactor
# to avoid the duplication in the future, but it's tolerable for now.
#
$(POSTGRES_INSTALL_DIR)/build/%/config.status:
mkdir -p $(POSTGRES_INSTALL_DIR)
test -e $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG || echo "$(CACHEDIR_TAG_CONTENTS)" > $(POSTGRES_INSTALL_DIR)/CACHEDIR.TAG
+@echo "Configuring Postgres $* build"
@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \

View File

@@ -44,3 +44,4 @@ vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
zstd = "0.13"
bytes = "1.0"
rust-ini = "0.20.0"
rlimit = "0.10.1"

View File

@@ -6,7 +6,7 @@
//! - Every start is a fresh start, so the data directory is removed and
//! initialized again on each run.
//! - If remote_extension_config is provided, it will be used to fetch extensions list
//! and download `shared_preload_libraries` from the remote storage.
//! and download `shared_preload_libraries` from the remote storage.
//! - Next it will put configuration files into the `PGDATA` directory.
//! - Sync safekeepers and get commit LSN.
//! - Get `basebackup` from pageserver using the returned on the previous step LSN.
@@ -33,7 +33,6 @@
//! -b /usr/local/bin/postgres \
//! -r http://pg-ext-s3-gateway \
//! ```
//!
use std::collections::HashMap;
use std::fs::File;
use std::path::Path;
@@ -64,6 +63,7 @@ use compute_tools::monitor::launch_monitor;
use compute_tools::params::*;
use compute_tools::spec::*;
use compute_tools::swap::resize_swap;
use rlimit::{setrlimit, Resource};
// this is an arbitrary build tag. Fine as a default / for testing purposes
// in-case of not-set environment var
@@ -72,6 +72,9 @@ const BUILD_TAG_DEFAULT: &str = "latest";
fn main() -> Result<()> {
let (build_tag, clap_args) = init()?;
// enable core dumping for all child processes
setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
let (pg_handle, start_pg_result) = {
// Enter startup tracing context
let _startup_context_guard = startup_context_from_env();

View File

@@ -56,6 +56,7 @@ pub struct ComputeNode {
/// - we push new spec and it does reconfiguration
/// - but then something happens and compute pod / VM is destroyed,
/// so k8s controller starts it again with the **old** spec
///
/// and the same for empty computes:
/// - we started compute without any spec
/// - we push spec and it does configuration
@@ -1116,7 +1117,7 @@ impl ComputeNode {
// EKS worker nodes have following core dump settings:
// /proc/sys/kernel/core_pattern -> core
// /proc/sys/kernel/core_uses_pid -> 1
// ulimint -c -> unlimited
// ulimit -c -> unlimited
// which results in core dumps being written to postgres data directory as core.<pid>.
//
// Use that as a default location and pattern, except macos where core dumps are written

View File

@@ -9,6 +9,9 @@ pub(crate) struct MigrationRunner<'m> {
impl<'m> MigrationRunner<'m> {
pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self {
// The neon_migration.migration_id::id column is a bigint, which is equivalent to an i64
assert!(migrations.len() + 1 < i64::MAX as usize);
Self { client, migrations }
}
@@ -22,11 +25,8 @@ impl<'m> MigrationRunner<'m> {
Ok(row.get::<&str, i64>("id"))
}
fn update_migration_id(&mut self) -> Result<()> {
let setval = format!(
"UPDATE neon_migration.migration_id SET id={}",
self.migrations.len()
);
fn update_migration_id(&mut self, migration_id: i64) -> Result<()> {
let setval = format!("UPDATE neon_migration.migration_id SET id={}", migration_id);
self.client
.simple_query(&setval)
@@ -57,44 +57,49 @@ impl<'m> MigrationRunner<'m> {
pub fn run_migrations(mut self) -> Result<()> {
self.prepare_migrations()?;
let mut current_migration: usize = self.get_migration_id()? as usize;
let starting_migration_id = current_migration;
let query = "BEGIN";
self.client
.simple_query(query)
.context("run_migrations begin")?;
let mut current_migration = self.get_migration_id()? as usize;
while current_migration < self.migrations.len() {
macro_rules! migration_id {
($cm:expr) => {
($cm + 1) as i64
};
}
let migration = self.migrations[current_migration];
if migration.starts_with("-- SKIP") {
info!("Skipping migration id={}", current_migration);
info!("Skipping migration id={}", migration_id!(current_migration));
} else {
info!(
"Running migration id={}:\n{}\n",
current_migration, migration
migration_id!(current_migration),
migration
);
self.client
.simple_query("BEGIN")
.context("begin migration")?;
self.client.simple_query(migration).with_context(|| {
format!("run_migration current_migration={}", current_migration)
format!(
"run_migrations migration id={}",
migration_id!(current_migration)
)
})?;
// Migration IDs start at 1
self.update_migration_id(migration_id!(current_migration))?;
self.client
.simple_query("COMMIT")
.context("commit migration")?;
info!("Finished migration id={}", migration_id!(current_migration));
}
current_migration += 1;
}
self.update_migration_id()?;
let query = "COMMIT";
self.client
.simple_query(query)
.context("run_migrations commit")?;
info!(
"Ran {} migrations",
(self.migrations.len() - starting_migration_id)
);
Ok(())
}
}

View File

@@ -0,0 +1,7 @@
DO $$
BEGIN
IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser';
EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser';
END IF;
END $$;

View File

@@ -777,19 +777,22 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> {
// Add new migrations in numerical order.
let migrations = [
include_str!("./migrations/0000-neon_superuser_bypass_rls.sql"),
include_str!("./migrations/0001-alter_roles.sql"),
include_str!("./migrations/0002-grant_pg_create_subscription_to_neon_superuser.sql"),
include_str!("./migrations/0003-grant_pg_monitor_to_neon_superuser.sql"),
include_str!("./migrations/0004-grant_all_on_tables_to_neon_superuser.sql"),
include_str!("./migrations/0005-grant_all_on_sequences_to_neon_superuser.sql"),
include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
include_str!("./migrations/0002-alter_roles.sql"),
include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
include_str!(
"./migrations/0006-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
"./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
),
include_str!(
"./migrations/0007-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
"./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
),
include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
include_str!(
"./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
),
include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"),
];
MigrationRunner::new(client, &migrations).run_migrations()?;

View File

@@ -40,6 +40,7 @@ safekeeper_api.workspace = true
postgres_connection.workspace = true
storage_broker.workspace = true
utils.workspace = true
whoami.workspace = true
compute_api.workspace = true
workspace_hack.workspace = true

View File

@@ -1,9 +1,9 @@
//! Code to manage the storage broker
//!
//! In the local test environment, the data for each safekeeper is stored in
//! In the local test environment, the storage broker stores its data directly in
//!
//! ```text
//! .neon/safekeepers/<safekeeper id>
//! .neon
//! ```
use std::time::Duration;

View File

@@ -1,8 +1,10 @@
//! Code to manage pageservers
//!
//! In the local test environment, the pageserver stores its data directly in
//! In the local test environment, the data for each pageserver is stored in
//!
//! .neon/
//! ```text
//! .neon/pageserver_<pageserver_id>
//! ```
//!
use std::collections::HashMap;
@@ -23,6 +25,7 @@ use pageserver_client::mgmt_api;
use postgres_backend::AuthType;
use postgres_connection::{parse_host_port, PgConnectionConfig};
use utils::auth::{Claims, Scope};
use utils::id::NodeId;
use utils::{
id::{TenantId, TimelineId},
lsn::Lsn,
@@ -72,6 +75,10 @@ impl PageServerNode {
}
}
fn pageserver_make_identity_toml(&self, node_id: NodeId) -> toml_edit::Document {
toml_edit::Document::from_str(&format!("id={node_id}")).unwrap()
}
fn pageserver_init_make_toml(
&self,
conf: NeonLocalInitPageserverConf,
@@ -184,6 +191,19 @@ impl PageServerNode {
.write_all(config.to_string().as_bytes())
.context("write pageserver toml")?;
drop(config_file);
let identity_file_path = datadir.join("identity.toml");
let mut identity_file = std::fs::OpenOptions::new()
.create_new(true)
.write(true)
.open(identity_file_path)
.with_context(|| format!("open identity toml for write: {config_file_path:?}"))?;
let identity_toml = self.pageserver_make_identity_toml(node_id);
identity_file
.write_all(identity_toml.to_string().as_bytes())
.context("write identity toml")?;
drop(identity_toml);
// TODO: invoke a TBD config-check command to validate that pageserver will start with the written config
// Write metadata file, used by pageserver on startup to register itself with
@@ -349,11 +369,6 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.remove("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
eviction_policy: settings
.remove("eviction_policy")
.map(serde_json::from_str)
@@ -454,11 +469,6 @@ impl PageServerNode {
.map(|x| x.parse::<NonZeroU64>())
.transpose()
.context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?,
trace_read_requests: settings
.remove("trace_read_requests")
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
eviction_policy: settings
.remove("eviction_policy")
.map(serde_json::from_str)

View File

@@ -29,7 +29,6 @@ use utils::{
pub struct StorageController {
env: LocalEnv,
listen: String,
path: Utf8PathBuf,
private_key: Option<Vec<u8>>,
public_key: Option<String>,
postgres_port: u16,
@@ -41,6 +40,8 @@ const COMMAND: &str = "storage_controller";
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
const DB_NAME: &str = "storage_controller";
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -65,10 +66,6 @@ pub struct InspectResponse {
impl StorageController {
pub fn from_env(env: &LocalEnv) -> Self {
let path = Utf8PathBuf::from_path_buf(env.base_data_dir.clone())
.unwrap()
.join("attachments.json");
// Makes no sense to construct this if pageservers aren't going to use it: assume
// pageservers have control plane API set
let listen_url = env.control_plane_api.clone().unwrap();
@@ -128,7 +125,6 @@ impl StorageController {
Self {
env: env.clone(),
path,
listen,
private_key,
public_key,
@@ -203,7 +199,6 @@ impl StorageController {
///
/// Returns the database url
pub async fn setup_database(&self) -> anyhow::Result<String> {
const DB_NAME: &str = "storage_controller";
let database_url = format!("postgresql://localhost:{}/{DB_NAME}", self.postgres_port);
let pg_bin_dir = self.get_pg_bin_dir().await?;
@@ -232,6 +227,30 @@ impl StorageController {
Ok(database_url)
}
pub async fn connect_to_database(
&self,
) -> anyhow::Result<(
tokio_postgres::Client,
tokio_postgres::Connection<tokio_postgres::Socket, tokio_postgres::tls::NoTlsStream>,
)> {
tokio_postgres::Config::new()
.host("localhost")
.port(self.postgres_port)
// The user is the ambient operating system user name.
// That is an impurity which we want to fix in => TODO https://github.com/neondatabase/neon/issues/8400
//
// Until we get there, use the ambient operating system user name.
// Recent tokio-postgres versions default to this if the user isn't specified.
// But tokio-postgres fork doesn't have this upstream commit:
// https://github.com/sfackler/rust-postgres/commit/cb609be758f3fb5af537f04b584a2ee0cebd5e79
// => we should rebase our fork => TODO https://github.com/neondatabase/neon/issues/8399
.user(&whoami::username())
.dbname(DB_NAME)
.connect(tokio_postgres::NoTls)
.await
.map_err(anyhow::Error::new)
}
pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
// Start a vanilla Postgres process used by the storage controller for persistence.
let pg_data_path = Utf8PathBuf::from_path_buf(self.env.base_data_dir.clone())
@@ -256,18 +275,21 @@ impl StorageController {
if !status.success() {
anyhow::bail!("initdb failed with status {status}");
}
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", self.postgres_port),
)
.await?;
};
// Write a minimal config file:
// - Specify the port, since this is chosen dynamically
// - Switch off fsync, since we're running on lightweight test environments and when e.g. scale testing
// the storage controller we don't want a slow local disk to interfere with that.
//
// NB: it's important that we rewrite this file on each start command so we propagate changes
// from `LocalEnv`'s config file (`.neon/config`).
tokio::fs::write(
&pg_data_path.join("postgresql.conf"),
format!("port = {}\nfsync=off\n", self.postgres_port),
)
.await?;
println!("Starting storage controller database...");
let db_start_args = [
"-w",
@@ -296,11 +318,38 @@ impl StorageController {
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
// We support running a startup SQL script to fiddle with the database before we launch storcon.
// This is used by the test suite.
let startup_script_path = self
.env
.base_data_dir
.join("storage_controller_db.startup.sql");
let startup_script = match tokio::fs::read_to_string(&startup_script_path).await {
Ok(script) => {
tokio::fs::remove_file(startup_script_path).await?;
script
}
Err(e) => {
if e.kind() == std::io::ErrorKind::NotFound {
// always run some startup script so that this code path doesn't bit rot
"BEGIN; COMMIT;".to_string()
} else {
anyhow::bail!("Failed to read startup script: {e}")
}
}
};
let (mut client, conn) = self.connect_to_database().await?;
let conn = tokio::spawn(conn);
let tx = client.build_transaction();
let tx = tx.start().await?;
tx.batch_execute(&startup_script).await?;
tx.commit().await?;
drop(client);
conn.await??;
let mut args = vec![
"-l",
&self.listen,
"-p",
self.path.as_ref(),
"--dev",
"--database-url",
&database_url,

View File

@@ -17,6 +17,7 @@ pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
storage_controller_client.workspace = true
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true

View File

@@ -14,15 +14,15 @@ use pageserver_api::{
},
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use pageserver_client::mgmt_api::{self};
use reqwest::{Method, StatusCode, Url};
use serde::{de::DeserializeOwned, Serialize};
use utils::id::{NodeId, TenantId};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
TenantShardMigrateRequest, TenantShardMigrateResponse,
};
use storage_controller_client::control_api::Client;
#[derive(Subcommand, Debug)]
enum Command {
@@ -56,6 +56,10 @@ enum Command {
#[arg(long)]
scheduling: Option<NodeSchedulingPolicy>,
},
NodeDelete {
#[arg(long)]
node_id: NodeId,
},
/// Modify a tenant's policies in the storage controller
TenantPolicy {
#[arg(long)]
@@ -245,64 +249,6 @@ impl FromStr for NodeAvailabilityArg {
}
}
struct Client {
base_url: Url,
jwt_token: Option<String>,
client: reqwest::Client,
}
impl Client {
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
Self {
base_url,
jwt_token,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
}
}
/// Simple HTTP request wrapper for calling into storage controller
async fn dispatch<RQ, RS>(
&self,
method: Method,
path: String,
body: Option<RQ>,
) -> mgmt_api::Result<RS>
where
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
self.base_url.host_str().unwrap(),
self.base_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
let response = response.error_from_body().await?;
response
.json()
.await
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
}
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();
@@ -337,7 +283,7 @@ async fn main() -> anyhow::Result<()> {
}
Command::TenantCreate { tenant_id } => {
storcon_client
.dispatch(
.dispatch::<_, ()>(
Method::POST,
"v1/tenant".to_string(),
Some(TenantCreateRequest {
@@ -357,13 +303,16 @@ async fn main() -> anyhow::Result<()> {
tracing::info!("Delete status: {}", status);
}
Command::Nodes {} => {
let resp = storcon_client
let mut resp = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
resp.sort_by(|a, b| a.listen_http_addr.cmp(&b.listen_http_addr));
let mut table = comfy_table::Table::new();
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
for node in resp {
@@ -395,13 +344,16 @@ async fn main() -> anyhow::Result<()> {
.await?;
}
Command::Tenants {} => {
let resp = storcon_client
let mut resp = storcon_client
.dispatch::<(), Vec<TenantDescribeResponse>>(
Method::GET,
"control/v1/tenant".to_string(),
None,
)
.await?;
resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id));
let mut table = comfy_table::Table::new();
table.set_header([
"TenantId",
@@ -650,6 +602,11 @@ async fn main() -> anyhow::Result<()> {
.dispatch::<(), ()>(Method::POST, format!("debug/v1/node/{node_id}/drop"), None)
.await?;
}
Command::NodeDelete { node_id } => {
storcon_client
.dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
.await?;
}
Command::TenantSetTimeBasedEviction {
tenant_id,
period,

View File

@@ -33,7 +33,7 @@ echo $result | jq .
generate_id timeline_id
PARAMS=(
-sb
-sbf
-X POST
-H "Content-Type: application/json"
-d "{\"new_timeline_id\": \"${timeline_id}\", \"pg_version\": ${PG_VERSION}}"

View File

@@ -31,25 +31,14 @@ services:
restart: always
image: ${REPOSITORY:-neondatabase}/neon:${TAG:-latest}
environment:
- BROKER_ENDPOINT='http://storage_broker:50051'
- AWS_ACCESS_KEY_ID=minio
- AWS_SECRET_ACCESS_KEY=password
#- RUST_BACKTRACE=1
ports:
#- 6400:6400 # pg protocol handler
- 9898:9898 # http endpoints
entrypoint:
- "/bin/sh"
- "-c"
command:
- "/usr/local/bin/pageserver -D /data/.neon/
-c \"broker_endpoint=$$BROKER_ENDPOINT\"
-c \"listen_pg_addr='0.0.0.0:6400'\"
-c \"listen_http_addr='0.0.0.0:9898'\"
-c \"remote_storage={endpoint='http://minio:9000',
bucket_name='neon',
bucket_region='eu-north-1',
prefix_in_bucket='/pageserver/'}\""
volumes:
- ./pageserver_config:/data/.neon/
depends_on:
- storage_broker
- minio_create_buckets

View File

@@ -0,0 +1 @@
id=1234

View File

@@ -0,0 +1,5 @@
broker_endpoint='http://storage_broker:50051'
pg_distrib_dir='/usr/local/'
listen_pg_addr='0.0.0.0:6400'
listen_http_addr='0.0.0.0:9898'
remote_storage={ endpoint='http://minio:9000', bucket_name='neon', bucket_region='eu-north-1', prefix_in_bucket='/pageserver' }

View File

@@ -0,0 +1,252 @@
# Ancestor Timeline Deletion
Created on: 2024-02-23
Author: John Spray
# Summary
When a tenant creates a new timeline that they will treat as their 'main' history,
it is awkward to permanently retain an 'old main' timeline as its ancestor. Currently
this is necessary because it is forbidden to delete a timeline which has descendents.
A new pageserver API is proposed to 'adopt' data from a parent timeline into
one of its children, such that the link between ancestor and child can be severed,
leaving the parent in a state where it may then be deleted.
# Motivation
Retaining parent timelines currently has two costs:
- Cognitive load on users, who have to remember which is the "real" main timeline.
- Storage capacity cost, as the parent timeline will retain layers up to the
child's timeline point, even if the child fully covers its keyspace with image
layers and will never actually read from the parent.
# Solution
A new pageserver API `PUT /v1/tenant/:tenant_id/timeline/:timeline_id/detach_ancestor`
will be added. The `timeline_id` in this URL is that of the _child_ timeline that we
wish to detach from its parent.
On success, this API will leave the following state:
- The detached child timeline will no longer have an ancestor, and will contain all
the data needed to service reads without recursing into an ancestor.
- Any other children of the parent whose timeline points were at a lower LSN than
the detached child timeline will be modified to have the child timeline as their
new parent.
- The parent timeline will still exist, but the child will no longer have it as an
ancestor. If this was the last timeline that depended on the parent, then the
parent will become deletable.
This API's implementation will consist of a series of retryable steps, such that
on failures/timeout it can safely be called again to reach the target state.
## Example
### Before
The user has "rolled back" their project to LSN X, resulting in a "new main"
timeline. The parent "old main" timeline still exists, and they would like
to clean it up.
They have two other timelines A and B. A is from before the rollback point,
and B is from after the rollback point.
```
----"old main" timeline-------X-------------------------------------------->
| | |
|-> child A | |
|-> "new main" timeline |
-> child B
```
### After calling detach ancestor API
The "new main" timeline is no longer dependent on old main, and neither
is child A, because it had a branch point before X.
The user may now choose to delete child B and "old main" to get to
a pristine state. Child B is likely to be unwanted since the user
chose to roll back to X, and it branches from after X. However, we
don't assume this in the API; it is up to the user to delete it.
```
|----"old main" timeline---------------------------------------------------->
|
|
|
-> child B
|----"new main" timeline--------->
|
|-> child A
```
### After removing timelines
We end up with a totally clean state that leaves no trace that a rollback
ever happened: there is only one root timeline.
```
| ----"new main" timeline----------->
|
|-> child A
```
## Caveats
Important things for API users to bear in mind:
- this API does not delete the parent timeline: you must still do that explicitly.
- if there are other child timelines ahead of the branch point of the detached
child, the parent won't be deletable: you must either delete or detach those
children.
- do _not_ simply loop over all children and detach them all: this can have an
extremely high storage cost. The detach ancestor API is intended for use on a single
timeline to make it the new "main".
- The detach ancestor API should also not be
exposed directly to the user as button/API, because they might decide
to click it for all the children and thereby generate many copies of the
parent's data -- the detach ancestor API should be used as part
of a high level "clean up after rollback" feature.
## `detach_ancestor` API implementation
Terms used in the following sections:
- "the child": the timeline whose ID is specified in the detach ancestor API URL, also
called "new main" in the example.
- "the parent": the parent of "the child". Also called "old main" in the example.
- "the branch point" the ancestor_lsn of "the child"
### Phase 1: write out adopted layers to S3
The child will "adopt" layers from the parent, such that its end state contains
all the parent's history as well as its own.
For all layers in the parent's layer map whose high LSN is below the branch
point, issue S3 CopyObject requests to duplicate them into the child timeline's
prefix. Do not add them to the child's layer map yet.
For delta layers in the parent's layer map which straddle the branch point, read them
and write out only content up to the branch point into new layer objects.
This is a long running operation if the parent has many layers: it should be
implemented in a way that resumes rather than restarting from scratch, if the API
times out and is called again.
As an optimization, if there are no other timelines that will be adopted into
the child, _and_ the child's image layers already full cover the branch LSN,
then we may skip adopting layers.
### Phase 2: update the child's index
Having written out all needed layers in phase 1, atomically link them all
into the child's IndexPart and upload to S3. This may be done while the
child Timeline is still running.
### Phase 3: modify timelines ancestry
Modify the child's ancestor to None, and upload its IndexPart to persist the change.
For all timelines which have the same parent as the child, and have a branch
point lower than our branch point, switch their ancestor_timeline to the child,
and upload their IndexPart to persist the change.
## Alternatives considered
### Generate full image layer on child, rather than adopting parent deltas
This would work for the case of a single child, but would prevent re-targeting
other timelines that depended on the parent. If we detached many children this
way, the storage cost would become prohibitive (consider a 1TB database with
100 child timelines: it would cost 100TiB if they all generated their own image layers).
### Don't rewrite anything: just fake it in the API
We could add a layer of indirection that let a child "pretend" that it had no
ancestor, when in reality it still had the parent. The pageserver API could
accept deletion of ancestor timelines, and just update child metadata to make
them look like they have no ancestor.
This would not achieve the desired reduction in storage cost, and may well be more
complex to maintain than simply implementing the API described in this RFC.
### Avoid copying objects: enable child index to use parent layers directly
We could teach IndexPart to store a TimelineId for each layer, such that a child
timeline could reference a parent's layers directly, rather than copying them
into the child's prefix.
This would impose a cost for the normal case of indices that only target the
timeline's own layers, add complexity, and break the useful simplifying
invariant that timelines "own" their own path. If child timelines were
referencing layers from the parent, we would have to ensure that the parent
never runs GC/compaction again, which would make the API less flexible (the
proposal in this RFC enables deletion of the parent but doesn't require it.)
## Performance
### Adopting layers
- CopyObject is a relatively cheap operation, but we may need to issue tens of thousands
of such requests: this can take up to tens of seconds and will compete for RemoteStorage
semaphore units with other activity on the pageserver.
- If we are running on storage backend that doesn't implement CopyObject, then
this part will be much more expensive as we would stream all layer content
through the pageserver. This is no different to issuing a lot
of reads to a timeline that does not have a warm local cache: it will move
a lot of gigabytes, but that shouldn't break anything.
- Generating truncated layers for delta that straddle the branch point will
require streaming read/write of all the layers in question.
### Updating timeline ancestry
The simplest way to update timeline ancestry will probably be to stop and start
all the Timeline objects: this is preferable to the complexity of making their
ancestry mutable at runtime.
There will be a corresponding "stutter" in the availability of the timelines,
of the order 10-100ms, which is the time taken to upload their IndexPart, and
restart the Timeline.
# Interaction with other features
## Concurrent timeline creation
If new historic timelines are created using the parent as an ancestor while the
detach ancestor API is running, they will not be re-parented to the child. This
doesn't break anything, but it leaves the parent in a state where it might not
be possible to delete it.
Since timeline creations are an explicit user action, this is not something we need to
worry about as the storage layer: a user who wants to delete their parent timeline will not create
new children, and if they do, they can choose to delete those children to
enable deleting the parent.
For the least surprise to the user, before starting the detach ancestor branch
operation, the control plane should wait until all branches are created and not
allow any branches to be created before the branch point on the ancestor branch
while the operation is ongoing.
## WAL based disaster recovery
WAL based disaster recovery currently supports only restoring of the main
branch. Enabling WAL based disaster recovery in the future requires that we
keep a record which timeline generated the WAL and at which LSN was a parent
detached. Keep a list of timeline ids and the LSN in which they were detached in
the `index_part.json`. Limit the size of the list to 100 first entries, after
which the WAL disaster recovery will not be possible.
## Sharded tenants
For sharded tenants, calls to the detach ancestor API will pass through the storage
controller, which will handle them the same as timeline creations: invoke first
on shard zero, and then on all the other shards.

View File

@@ -0,0 +1,507 @@
# Timeline Archival
## Summary
This RFC describes a mechanism for pageservers to eliminate local storage + compute work
for timelines which are not in use, in response to external API calls to "archive" a timeline.
The archived state roughly corresponds to fully offloading a timeline to object storage, such
that its cost is purely the cost of that object storage.
## Motivation
Archived timelines serve multiple purposes:
- Act as a 'snapshot' for workloads that would like to retain restorable copies of their
database from longer ago than their PITR window.
- Enable users to create huge numbers of branches (e.g. one per github PR) without having
to diligently clean them up later to avoid overloading the pageserver (currently we support
up to ~500 branches per tenant).
### Prior art
Most storage and database systems have some form of snapshot, which can be implemented several ways:
1. full copies of data (e.g. an EBS snapshot to S3)
2. shallow snapshots which are CoW relative to the original version of the data, e.g. on a typical NFS appliance, or a filesystem like CephFS.
3. a series of snapshots which are CoW or de-duplicated relative to one another.
Today's Neon branches are approximately like `2.`, although due to implementation details branches
often end up storing much more data than they really need, as parent branches assume that all data
at the branch point is needed. The layers pinned in the parent branch may have a much larger size
than the physical size of a compressed image layer representing the data at the branch point.
## Requirements
- Enter & exit the archived state in response to external admin API calls
- API calls to modify the archived state are atomic and durable
- An archived timeline should eventually (once out of PITR window) use an efficient compressed
representation, and avoid retaining arbitrarily large data in its parent branch.
- Remote object GETs during tenant start may be O(N) with the number of _active_ branches,
but must not scale with the number of _archived_ branches.
- Background I/O for archived branches should only be done a limited number of times to evolve them
to a long-term-efficient state (e.g. rewriting to image layers). There should be no ongoing "housekeeping"
overhead for archived branches, including operations related to calculating sizes for billing.
- The pageserver should put no load on the safekeeper for archived branches.
- Performance of un-archiving a branch must make good use of S3/disk bandwidth to restore the branch
to a performant state in a short time (linear with the branch's logical size)
## Non Goals
- Archived branches are not a literal `fullbackup` postgres snapshot: they are still stored
in Neon's internal format.
- Compute cold starts after activating an archived branch will not have comparable performance to
cold starts on an active branch.
- Archived branches will not use any new/additional compression or de-duplication beyond what
is already implemented for image layers (zstd per page).
- The pageserver will not "auto start" archived branches in response to page_service API requests: they
are only activated explicitly via the HTTP API.
- We will not implement a total offload of archived timelines from safekeepers: their control file (small) will
remain on local disk, although existing eviction mechanisms will remove any segments from local disk.
- We will not expose any prometheus metrics for archived timelines, or make them visible in any
detailed HTTP APIs other than the specific API for listing archived timelines.
- A parent branch may not be archived unless all its children are.
## Impacted Components
pageserver, storage controller
## Terminology
**Archived**: a branch is _archived_ when an HTTP API request to archive it has succeeded: the caller
may assume that this branch is now very cheap to store, although this may not be physically so until the
branch proceeds to the offloaded state.
**Active** branches are branches which are available for use by page_service clients, and have a relatively
high cost due to consuming local storage.
**Offloaded** branches are a subset of _archived_ branches, which have had their local state removed such
that they now consume minimal runtime resources and have a cost similar to the cost of object storage.
**Activate** (verb): transition from Archived to Active
**Archive** (verb): transition from Active to Archived
**Offload** (verb): transition from Archived to Offloaded
**Offload manifest**: an object stored in S3 that describes timelines which pageservers do not load.
**Warm up** (verb): operation done on an active branch, by downloading its active layers. Once a branch is
warmed up, good performance will be available to page_service clients.
## Implementation
### High level flow
We may think of a timeline which is archived and then activated as proceeding through a series of states:
```mermaid
stateDiagram
[*] --> Active(warm)
Active(warm) --> Archived
Archived --> Offloaded
Archived --> Active(warm)
Offloaded --> Active(cold)
Active(cold) --> Active(warm)
```
Note that the transition from Archived to Active(warm) is expected to be fairly rare: the most common lifecycles
of branches will be:
- Very frequent: Short lived branches: Active -> Deleted
- Frequent: Long-lived branches: Active -> Archived -> Offloaded -> Deleted
- Rare: Branches used to restore old state: Active ->Archived -> Offloaded -> Active
These states are _not_ all stored as a single physical state on the timeline, but rather represent the combination
of:
- the timeline's lifecycle state: active or archived, stored in the timeline's index
- its offload state: whether pageserver has chosen to drop local storage of the timeline and write it into the
manifest of offloaded timelines.
- cache state (whether it's warm or cold).
### Storage format changes
There are two storage format changes:
1. `index_part.json` gets a new attribute `state` that describes whether the timeline is to
be considered active or archived.
2. A new tenant-level _manifest_ object `tenant_manifest-v1.json` describes which timelines a tenant does not need to load
at startup (and is available for storing other small, rarely changing tenant-wide attributes in future)
The manifest object will have a format like this:
```
{
"offload_timelines": [
{
"timeline_id": ...
"last_record_lsn": ...
"last_record_lsn_time": ...
"pitr_interval": ...
"last_gc_lsn": ... # equal to last_record_lsn if this branch has no history (i.e. a snapshot)
"logical_size": ... # The size at last_record_lsn
"physical_size" ...
"parent": Option<{
"timeline_id"...
"lsn"... # Branch point LSN on the parent
"requires_data": bool # True if this branch depends on layers in its parent, identify it here
}>
}
]
}
```
The information about a timeline in its offload state is intentionally minimal: just enough to decide:
- Whether it requires [archive optimization](#archive-branch-optimization) by rewriting as a set of image layers: we may infer this
by checking if now > last_record_lsn_time - pitr_interval, and pitr_lsn < last_record_lsn.
- Whether a parent branch should include this offloaded branch in its GC inputs to avoid removing
layers that the archived branch depends on
- Whether requests to delete this `timeline_id` should be executed (i.e. if a deletion request
is received for a timeline_id that isn't in the site of live `Timelines` or in the manifest, then
we don't need to go to S3 for the deletion.
- How much archived space to report in consumption metrics
The contents of the manifest's offload list will also be stored as an attribute of `Tenant`, such that the total
set of timelines may be found by the union of `Tenant::timelines` (non-offloaded timelines) and `Tenant::offloaded`
(offloaded timelines).
For split-brain protection, the manifest object will be written with a generation suffix, in the same way as
index_part objects are (see [generation numbers RFC](025-generation-numbers.md)). This will add some complexity, but
give us total safety against two pageservers with the same tenant attached fighting over the object. Existing code
for finding the latest generation and for cleaning up old generations (in the scrubber) will be generalized to cover
the manifest file.
### API & Timeline state
Timelines will store a lifecycle state (enum of Active or Archived) in their IndexPart. This will
be controlled by a new per-timeline `configure` endpoint. This is intentionally generic naming, which
may be used in future to control other per-timeline attributes (e.g. in future we may make PITR interval
a per-timeline configuration).
`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configure`
```
{
'state': 'active|archive'
}
```
When archiving a timeline, this API will complete as soon as the timeline's state has been set in index_part, and that index has been uploaded.
When activating a timeline, this API will complete as soon as the timeline's state has been set in index_part,
**and** the `Timeline` object has been instantiated and activated. This will require reading the timeline's
index, but not any data: it should be about as fast as a couple of small S3 requests.
The API will be available with identical path via the storage controller: calling this on a sharded tenant
will simply map the API call to all the shards.
Archived timelines may never have descendent timelines which are active. This will be enforced at the API level,
such that activating a timeline requires that all its ancestors are active, and archiving a timeline requires
that all its descendents are archived. It is the callers responsibility to walk the hierarchy of timelines
in the proper order if they would like to archive whole trees of branches.
Because archive timelines will be excluded from the usual timeline listing APIs, a new API specifically
for archived timelines will be added: this is for use in support/debug:
```
GET /v1/tenants/{tenant_id}/archived_timelines
{
...same per-timeline content as the tenant manifest...
}
```
### Tenant attach changes
Currently, during Tenant::spawn we list all the timelines in the S3 bucket, and then for each timeline
we load their index_part.json. To avoid the number of GETs scaling linearly with the number of archived
timelines, we must have a single object that tells us which timelines do not need to be loaded. The
number of ListObjects requests while listing timelines will still scale O(N), but this is less problematic
because each request covers 1000 timelines.
This is **not** literally the same as the set of timelines who have state=archived. Rather, it is
the set of timelines which have been offloaded in the background after their state was set to archived.
We may simply skip loading these timelines: there will be no special state of `Timeline`, they just won't
exist from the perspective of an active `Tenant` apart from in deletion: timeline deletion will need
to check for offloaded timelines as well as active timelines, to avoid wrongly returning 404 on trying
to delete an offloaded timeline.
### Warm-up API
`PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/download?wait_ms=1234`
This API will be similar to the existing `download_remote_layers` API, but smarter:
- It will not download _all_ remote layers, just the visible set (i.e. layers needed for a read)
- It will download layers in the visible set until reaching `wait_ms`, then return a struct describing progress
of downloads, so that the caller can poll.
The _visible set_ mentioned above will be calculated by the pageserver in the background, by taking the set
of readable LSNs (i.e. branch points and heads of branches), and walking the layer map to work out which layers
can possibly be read from these LSNs. This concept of layer visibility is more generally useful for cache
eviction and heatmaps, as well as in this specific case of warming up a timeline.
The caller does not have to wait for the warm up API, or call it at all. But it is strongly advised
to call it, because otherwise populating local contents for a timeline can take a long time when waiting
for SQL queries to coincidentally hit all the layers, and during that time query latency remains quite
volatile.
### Background work
Archived branches are not subject to normal compaction. Instead, when the compaction loop encounters
an archived branch, it will consider rewriting the branch to just image layers if the branch has no history
([archive branch optimization](#archive-branch-optimization)), or offloading the timeline from local disk
if its state permits that.
Additionally, the tenant compaction task will walk the state of already offloaded timelines to consider
optimizing their storage, e.g. if a timeline had some history when offloaded, but since then its PITR
has elapsed and it can now be rewritten to image layers.
#### Archive branch offload
Recall that when we archive a timeline via the HTTP API, this only sets a state: it doesn't do
any actual work.
This work is done in the background compaction loop. It makes sense to tag this work on to the compaction
loop, because it is spiritually aligned: offloading data for archived branches improves storage efficiency.
The condition for offload is simple:
- a `Timeline` object exists with state `Archived`
- the timeline does not have any non-offloaded children.
Regarding the condition that children must be offloaded, this will always be eventually true, because
we enforce at the API level that children of archived timelines must themselves be archived, and all
archived timelines will eventually be offloaded.
Offloading a timeline is simple:
- Read the timeline's attributes that we will store in its offloaded state (especially its logical size)
- Call `shutdown()` on the timeline and remove it from the `Tenant` (as if we were about to delete it)
- Erase all the timeline's content from local storage (`remove_dir_all` on its path)
- Write the tenant manifest to S3 to prevent this timeline being loaded on next start.
#### Archive branch optimization (flattening)
When we offloaded a branch, it might have had some history that prevented rewriting it to a single
point in time set of image layers. For example, a branch might have several days of writes and a 7
day PITR: when we archive it, it still has those days of history.
Once the PITR has expired, we have an opportunity to reduce the physical footprint of the branch by:
- Writing compressed image layers within the archived branch, as these are more efficient as a way of storing
a point in time compared with delta layers
- Updating the branch's offload metadata to indicate that this branch no longer depends on its ancestor
for data, i.e. the ancestor is free to GC layers files at+below the branch point
Fully compacting an archived branch into image layers at a single LSN may be thought of as *flattening* the
branch, such that it is now a one-dimensional keyspace rather than a two-dimensional key/lsn space. It becomes
a true snapshot at that LSN.
It is not always more efficient to flatten a branch than to keep some extra history on the parent: this
is described in more detail in [optimizations](#delaying-storage-optimization-if-retaining-parent-layers-is-cheaper)
Archive branch optimization should be done _before_ background offloads during compaction, because there may
be timelines which are ready to be offloaded but also would benefit from the optimization step before
being offloaded. For example, a branch which has already fallen out of PITR window and has no history
of its own may be immediately re-written as a series of image layers before being offloaded.
### Consumption metrics
Archived timelines and offloaded timelines will be excluded from the synthetic size calculation, in anticipating
that billing structures based on consumption metrics are highly likely to apply different $/GB rates to archived
vs. ordinary content.
Archived and offloaded timelines' logical size will be reported under the existing `timeline_logical_size`
variant of `MetricsKey`: receivers are then free to bill on this metric as they please.
### Secondary locations
Archived timelines (including offloaded timelines) will be excluded from heatmaps, and thereby
when a timeline is archived, after the next cycle of heatmap upload & secondary download, its contents
will be dropped from secondary locations.
### Sharding
Archiving or activating a timeline will be done symmetrically across all shards in a tenant, in
the same way that timeline creation and deletion is done. There are no special rules about ordering:
the storage controller may dispatch concurrent calls to all shards when archiving or activating a timeline.
Since consumption metrics are only transmitted from shard zero, the state of archival on this shard
will be authoritative for consumption metrics.
## Error cases
### Errors in sharded tenants
If one shard in a tenant fails an operation but others succeed, the tenant may end up in a mixed
state, where a timeline is archived on some shards but not on others.
We will not bother implementing a rollback mechanism for this: errors in archiving/activating a timeline
are either transient (e.g. S3 unavailable, shutting down), or the fault of the caller (NotFound, BadRequest).
In the transient case callers are expected to retry until success, or to make appropriate API calls to clear
up their mistake. We rely on this good behavior of callers to eventually get timelines into a consistent
state across all shards. If callers do leave a timeline in an inconsistent state across shards, this doesn't
break anything, it's just "weird".
This is similar to the status quo for timeline creation and deletion: callers are expected to retry
these operations until they succeed.
### Archiving/activating
Archiving/activating a timeline can fail in a limited number of ways:
1. I/O error storing/reading the timeline's updated index
- These errors are always retryable: a fundamental design assumption of the pageserver is that remote
storage errors are always transient.
2. NotFound if the timeline doesn't exist
- Callers of the API are expected to avoid calling deletion and archival APIs concurrently.
- The storage controller has runtime locking to prevent races such as deleting a timeline while
archiving it.
3. BadRequest if the rules around ancestors/descendents of archived timelines would be violated
- Callers are expected to do their own checks to avoid hitting this case. If they make
a mistake and encounter this error, they should give up.
### Offloading
Offloading can only fail if remote storage is unavailable, which would prevent us from writing the
tenant manifest. In such error cases, we give up in the expectation that offloading will be tried
again at the next iteration of the compaction loop.
### Archive branch optimization
Optimization is a special form of compaction, so can encounter all the same errors as regular compaction
can: it should return Result<(), CompactionError>, and as with compaction it will be retried on
the next iteration of the compaction loop.
## Optimizations
### Delaying storage optimization if retaining parent layers is cheaper
Optimizing archived branches to image layers and thereby enabling parent branch GC to progress
is a safe default: archived branches cannot over-fill a pageserver's local disk, and once they
are offloaded to S3 they're totally safe, inert things.
However, in some cases it can be advantageous to retain extra history on their parent branch rather
than flattening the archived branch. For example, if a 1TB parent branch is rather slow-changing (1GB
of data per day), and archive branches are being created nightly, then writing out full 1TB image layers
for each nightly branch is inefficient compared with just keeping more history on the main branch.
Getting this right requires consideration of:
- Compaction: if keeping more history on the main branch is going to prompt the main branch's compaction to
write out extra image layers, then it might make more sense to just write out the image layers on
the archived branch.
- Metadata bloat: keeping extra history on a parent branch doesn't just cost GB of storage, it makes
the layer map (and index_part) bigger. There are practical limits beyond which writing an indefinitely
large layer map can cause problems elsewhere.
This optimization can probably be implemented quite cheaply with some basic heuristics like:
- don't bother doing optimization on an archive branch if the LSN distance between
its branch point and the end of the PITR window is <5% of the logical size of the archive branch.
- ...but, Don't keep more history on the main branch than double the PITR
### Creating a timeline in archived state (a snapshot)
Sometimes, one might want to create a branch with no history, which will not be written to
before it is archived. This is a snapshot, although we do not require a special snapshot API,
since a snapshot can be represented as a timeline with no history.
This can be accomplished by simply creating a timeline and then immediately archiving it, but
that is somewhat wasteful: this timeline it will spin up various tasks and open a connection to the storage
broker to try and ingest WAL, before being shutdown in the subsequent archival call. To explicitly
support this common special case, we may add a parameter to the timeline creation API which
creates a timeline directly into the archived state.
Such a timeline creation will do exactly two I/Os at creation time:
- write the index_part object to record the timeline's existence
- when the timeline is offloaded in the next iteration of the compaction loop (~20s later),
write the tenant manifest.
Later, when the timeline falls off the end of the PITR interval, the usual offload logic will wake
up the 'snapshot' branch and write out image layers.
## Future Work
### Enabling `fullbackup` dumps from archive branches
It would be useful to be able to export an archive branch to another system, or for use in a local
postgres database.
This could be implemented as a general capability for all branches, in which case it would "just work"
for archive branches by activating them. However, downloading all the layers in a branch just to generate
a fullbackup is a bit inefficient: we could implement a special case for flattened archived branches
which streams image layers from S3 and outputs the fullbackup stream without writing the layers out to disk.
Implementing `fullbackup` is a bit more complicated than this because of sharding, but solving that problem
is unrelated to the topic of archived branches (it probably involves having each shard write out a fullbackup
stream to S3 in an intermediate format and, then having one node stitch them together).
### Tagging layers from archived branches
When we know a layer is an image layer written for an archived branch that has fallen off the PITR window,
we may add tags to the S3 objects to enable writing lifecycle policies that transition such layers to even
cheaper storage.
This could be done for all archived layers, or it could be driven by the archival API, to give the pageserver
external hints on which branches are likely to be reactivated, and which branches are good candidates for
tagging for low performance storage.
Tagging+lifecycles is just one mechanism: one might also directly use S3 storage classes. Other clouds' object
stores have similar mechanisms.
### Storing sequences of archive branches as deltas
When archived branches are used as scheduled snapshots, we could store them even more efficiently
by encoding them as deltas relative to each other (i.e. for nightly snapshots, when we do the
storage optimization for Tuesday's snapshot, we would read Monday's snapshot and store only the modified
pages). This is the kind of encoding that many backup storage systems use.
The utility of this depends a lot on the churn rate of the data, and the cost of doing the delta encoding
vs. just writing out a simple stream of the entire database. For smaller databases, writing out a full
copy is pretty trivial (e.g. writing a compressed copy of a 10GiB database to S3 can take under 10 seconds,
so the complexity tradeoff of diff-encoding it is dubious).
One does not necessarily have to read-back the previous snapshot in order to encoded the next one: if the
pageserver knows about the schedule, it can intentionally retain extra history on the main branch so that
we can say: "A branch exists from Monday night. I have Monday night's data still active in the main branch,
so now I can read at the Monday LSN and the Tuesday LSN, calculate the delta, and store it as Tuesday's
delta snapshot".
Clearly this all requires careful housekeeping to retain the relationship between branches that depend on
each other: perhaps this would be done by making the archive branches have child/parent relationships with
each other, or perhaps we would permit them to remain children of their original parent, but additionally
have a relationship with the snapshot they're encoded relative to.
Activating a branch that is diff-encoded may require activating several earlier branches too, so figuring
out how frequently to write a full copy is important. This is essentially a zoomed-out version of what
we do with delta layers and image layers within a timeline, except each "layer" is a whole timeline.
## FAQ/Alternatives
### Store all timelines in the tenant manifest
Rather than special-casing offloaded timelines in the offload manifest, we could store a total
manifest of all timelines, eliminating the need for the pageserver to list timelines in S3 on
startup.
That would be a more invasive change (require hooking in to timeline creation), and would
generate much more I/O to this manifest for tenants that had many branches _and_ frequent
create/delete cycles for short lived branches. Restricting the manifest to offloaded timelines
means that we only have to cope with the rate at which long-lived timelines are archived, rather
than the rate at which sort lived timelines are created & destroyed.
### Automatically archiving/activating timelines without external API calls
We could implement TTL driven offload of timelines, waking them up when a page request
arrives.
This has downsides:
- Opacity: if we do TTL-driven offload inside the pageserver, then the end user doesn't
know which of their branches are in this state, and might get a surprise when they try
to use such a branch.
- Price fluctuation: if the archival of a branch is used in end user pricing, then users
prefer clarity & consistency. Ideally a branch's storage should cost the same from the moment it
is created, rather than having a usage-dependency storage price.
- Complexity: enabling the page service to call up into the Tenant to activate a timeline
would be awkward, compared with an external entry point.
### Make offloaded a state of Timeline
To reduce the operator-facing complexity of having some timelines APIs that only return
non-offloaded timelines, we could build the offloaded state into the Timeline type.
`timeline.rs` is already one of the most egregiously long source files in the tree, so
this is rejected on the basis that we need to avoid making that complexity worse.

View File

@@ -44,7 +44,7 @@ If you need to modify the database schema, heres how to create a migration:
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/storage_controller`
- Commit the migration files and the changes to schema.rs
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once youve committed a migration no further steps are needed.

View File

@@ -87,7 +87,7 @@ pub struct TenantLocateResponse {
pub shard_params: ShardParameters,
}
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantDescribeResponse {
pub tenant_id: TenantId,
pub shards: Vec<TenantDescribeResponseShard>,
@@ -110,7 +110,7 @@ pub struct NodeDescribeResponse {
pub listen_pg_port: u16,
}
#[derive(Serialize, Deserialize)]
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantDescribeResponseShard {
pub tenant_shard_id: TenantShardId,

View File

@@ -5,7 +5,6 @@ pub mod utilization;
pub use utilization::PageserverUtilization;
use std::{
borrow::Cow,
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
@@ -20,7 +19,6 @@ use serde::{Deserialize, Serialize};
use serde_with::serde_as;
use utils::{
completion,
history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
serde_system_time,
@@ -294,7 +292,6 @@ pub struct TenantConfig {
pub walreceiver_connect_timeout: Option<String>,
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
pub eviction_policy: Option<EvictionPolicy>,
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
@@ -652,6 +649,17 @@ pub struct TenantDetails {
pub timelines: Vec<TimelineId>,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Copy, Debug)]
pub enum TimelineArchivalState {
Archived,
Unarchived,
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
pub struct TimelineArchivalConfigRequest {
pub state: TimelineArchivalState,
}
/// This represents the output of the "timeline_detail" and "timeline_list" API calls.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct TimelineInfo {
@@ -716,58 +724,7 @@ pub struct LayerMapInfo {
pub historic_layers: Vec<HistoricLayerInfo>,
}
#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize, enum_map::Enum)]
#[repr(usize)]
pub enum LayerAccessKind {
GetValueReconstructData,
Iter,
KeyIter,
Dump,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerAccessStatFullDetails {
pub when_millis_since_epoch: u64,
pub task_kind: Cow<'static, str>,
pub access_kind: LayerAccessKind,
}
/// An event that impacts the layer's residence status.
#[serde_as]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerResidenceEvent {
/// The time when the event occurred.
/// NB: this timestamp is captured while the residence status changes.
/// So, it might be behind/ahead of the actual residence change by a short amount of time.
///
#[serde(rename = "timestamp_millis_since_epoch")]
#[serde_as(as = "serde_with::TimestampMilliSeconds")]
pub timestamp: SystemTime,
/// The new residence status of the layer.
pub status: LayerResidenceStatus,
/// The reason why we had to record this event.
pub reason: LayerResidenceEventReason,
}
/// The reason for recording a given [`LayerResidenceEvent`].
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum LayerResidenceEventReason {
/// The layer map is being populated, e.g. during timeline load or attach.
/// This includes [`RemoteLayer`] objects created in [`reconcile_with_remote`].
/// We need to record such events because there is no persistent storage for the events.
///
// https://github.com/rust-lang/rust/issues/74481
/// [`RemoteLayer`]: ../../tenant/storage_layer/struct.RemoteLayer.html
/// [`reconcile_with_remote`]: ../../tenant/struct.Timeline.html#method.reconcile_with_remote
LayerLoad,
/// We just created the layer (e.g., freeze_and_flush or compaction).
/// Such layers are always [`LayerResidenceStatus::Resident`].
LayerCreate,
/// We on-demand downloaded or evicted the given layer.
ResidenceChange,
}
/// The residence status of the layer, after the given [`LayerResidenceEvent`].
/// The residence status of a layer
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum LayerResidenceStatus {
/// Residence status for a layer file that exists locally.
@@ -777,23 +734,16 @@ pub enum LayerResidenceStatus {
Evicted,
}
impl LayerResidenceEvent {
pub fn new(status: LayerResidenceStatus, reason: LayerResidenceEventReason) -> Self {
Self {
status,
reason,
timestamp: SystemTime::now(),
}
}
}
#[serde_as]
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerAccessStats {
pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
pub task_kind_access_flag: Vec<Cow<'static, str>>,
pub first: Option<LayerAccessStatFullDetails>,
pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
#[serde_as(as = "serde_with::TimestampMilliSeconds")]
pub access_time: SystemTime,
#[serde_as(as = "serde_with::TimestampMilliSeconds")]
pub residence_time: SystemTime,
pub visible: bool,
}
#[derive(Debug, Clone, Serialize, Deserialize)]

View File

@@ -1,6 +1,6 @@
use utils::id::TimelineId;
#[derive(Default, serde::Serialize)]
#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct AncestorDetached {
pub reparented_timelines: Vec<TimelineId>,
}

View File

@@ -7,6 +7,7 @@ license.workspace = true
[dependencies]
anyhow.workspace = true
async-trait.workspace = true
async-stream.workspace = true
once_cell.workspace = true
aws-smithy-async.workspace = true
aws-smithy-types.workspace = true

View File

@@ -15,7 +15,7 @@ use std::time::SystemTime;
use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result;
use azure_core::request_options::{MaxResults, Metadata, Range};
use azure_core::RetryOptions;
use azure_core::{Continuable, RetryOptions};
use azure_identity::DefaultAzureCredential;
use azure_storage::StorageCredentials;
use azure_storage_blobs::blob::CopyStatus;
@@ -40,6 +40,7 @@ use crate::{
pub struct AzureBlobStorage {
client: ContainerClient,
container_name: String,
prefix_in_container: Option<String>,
max_keys_per_list_response: Option<NonZeroU32>,
concurrency_limiter: ConcurrencyLimiter,
@@ -85,6 +86,7 @@ impl AzureBlobStorage {
Ok(AzureBlobStorage {
client,
container_name: azure_config.container_name.to_owned(),
prefix_in_container: azure_config.prefix_in_container.to_owned(),
max_keys_per_list_response,
concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
@@ -238,6 +240,10 @@ impl AzureBlobStorage {
_ = cancel.cancelled() => Err(Cancelled),
}
}
pub fn container_name(&self) -> &str {
&self.container_name
}
}
fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
@@ -261,30 +267,30 @@ fn to_download_error(error: azure_core::Error) -> DownloadError {
}
impl RemoteStorage for AzureBlobStorage {
async fn list(
fn list_streaming(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> anyhow::Result<Listing, DownloadError> {
let _permit = self.permit(RequestKind::List, cancel).await?;
) -> impl Stream<Item = Result<Listing, DownloadError>> {
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| self.relative_path_to_name(p))
.or_else(|| self.prefix_in_container.clone())
.map(|mut p| {
// required to end with a separator
// otherwise request will return only the entry of a prefix
if matches!(mode, ListingMode::WithDelimiter)
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
}
p
});
let op = async {
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
.map(|p| self.relative_path_to_name(p))
.or_else(|| self.prefix_in_container.clone())
.map(|mut p| {
// required to end with a separator
// otherwise request will return only the entry of a prefix
if matches!(mode, ListingMode::WithDelimiter)
&& !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
{
p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
}
p
});
async_stream::stream! {
let _permit = self.permit(RequestKind::List, cancel).await?;
let mut builder = self.client.list_blobs();
@@ -300,21 +306,43 @@ impl RemoteStorage for AzureBlobStorage {
builder = builder.max_results(MaxResults::new(limit));
}
let response = builder.into_stream();
let response = response.into_stream().map_err(to_download_error);
let response = tokio_stream::StreamExt::timeout(response, self.timeout);
let response = response.map(|res| match res {
Ok(res) => res,
Err(_elapsed) => Err(DownloadError::Timeout),
});
let mut next_marker = None;
let mut response = std::pin::pin!(response);
'outer: loop {
let mut builder = builder.clone();
if let Some(marker) = next_marker.clone() {
builder = builder.marker(marker);
}
let response = builder.into_stream();
let response = response.into_stream().map_err(to_download_error);
let response = tokio_stream::StreamExt::timeout(response, self.timeout);
let response = response.map(|res| match res {
Ok(res) => res,
Err(_elapsed) => Err(DownloadError::Timeout),
});
let mut res = Listing::default();
let mut response = std::pin::pin!(response);
let mut max_keys = max_keys.map(|mk| mk.get());
while let Some(entry) = response.next().await {
let entry = entry?;
let mut max_keys = max_keys.map(|mk| mk.get());
let next_item = tokio::select! {
op = response.next() => Ok(op),
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
}?;
let Some(entry) = next_item else {
// The list is complete, so yield it.
break;
};
let mut res = Listing::default();
let entry = match entry {
Ok(entry) => entry,
Err(e) => {
// The error is potentially retryable, so we must rewind the loop after yielding.
yield Err(e);
continue;
}
};
next_marker = entry.continuation();
let prefix_iter = entry
.blobs
.prefixes()
@@ -333,19 +361,19 @@ impl RemoteStorage for AzureBlobStorage {
assert!(mk > 0);
mk -= 1;
if mk == 0 {
return Ok(res); // limit reached
yield Ok(res); // limit reached
break 'outer;
}
max_keys = Some(mk);
}
}
yield Ok(res);
// We are done here
if next_marker.is_none() {
break;
}
}
Ok(res)
};
tokio::select! {
res = op => res,
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
}
}

View File

@@ -26,7 +26,7 @@ use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use bytes::Bytes;
use futures::stream::Stream;
use futures::{stream::Stream, StreamExt};
use serde::{Deserialize, Serialize};
use tokio::sync::Semaphore;
use tokio_util::sync::CancellationToken;
@@ -160,13 +160,18 @@ pub struct Listing {
/// providing basic CRUD operations for storage files.
#[allow(async_fn_in_trait)]
pub trait RemoteStorage: Send + Sync + 'static {
/// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
/// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
/// List objects in remote storage, with semantics matching AWS S3's [`ListObjectsV2`].
///
/// The stream is guaranteed to return at least one element, even in the case of errors
/// (in that case it's an `Err()`), or an empty `Listing`.
///
/// The stream is not ending if it returns an error, as long as [`is_permanent`] returns false on the error.
/// The `next` function can be retried, and maybe in a future retry, there will be success.
///
/// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
/// from the absolute root of the bucket.
///
/// `mode` configures whether to use a delimiter. Without a delimiter all keys
/// `mode` configures whether to use a delimiter. Without a delimiter, all keys
/// within the prefix are listed in the `keys` of the result. With a delimiter, any "directories" at the top level of
/// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
/// returned in `keys` ().
@@ -175,13 +180,32 @@ pub trait RemoteStorage: Send + Sync + 'static {
/// will iteratively call listobjects until it runs out of keys. Note that this is not safe to use on
/// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure.
///
/// [`ListObjectsV2`]: <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>
/// [`is_permanent`]: DownloadError::is_permanent
fn list_streaming(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>>;
async fn list(
&self,
prefix: Option<&RemotePath>,
_mode: ListingMode,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Listing, DownloadError>;
) -> Result<Listing, DownloadError> {
let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel));
let mut combined = stream.next().await.expect("At least one item required")?;
while let Some(list) = stream.next().await {
let list = list?;
combined.keys.extend_from_slice(&list.keys);
combined.prefixes.extend_from_slice(&list.prefixes);
}
Ok(combined)
}
/// Streams the local file contents into remote into the remote storage entry.
///
@@ -288,8 +312,8 @@ impl Debug for Download {
/// Every storage, currently supported.
/// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
#[derive(Clone)]
// Require Clone for `Other` due to https://github.com/rust-lang/rust/issues/26925
#[derive(Clone)]
pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
LocalFs(LocalFs),
AwsS3(Arc<S3Bucket>),
@@ -298,13 +322,14 @@ pub enum GenericRemoteStorage<Other: Clone = Arc<UnreliableWrapper>> {
}
impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
// See [`RemoteStorage::list`].
pub async fn list(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> anyhow::Result<Listing, DownloadError> {
) -> Result<Listing, DownloadError> {
match self {
Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await,
Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await,
@@ -313,6 +338,23 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
}
// See [`RemoteStorage::list_streaming`].
pub fn list_streaming<'a>(
&'a self,
prefix: Option<&'a RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &'a CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>> + 'a {
match self {
Self::LocalFs(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel))
as Pin<Box<dyn Stream<Item = Result<Listing, DownloadError>>>>,
Self::AwsS3(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
Self::AzureBlob(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
Self::Unreliable(s) => Box::pin(s.list_streaming(prefix, mode, max_keys, cancel)),
}
}
/// See [`RemoteStorage::upload`]
pub async fn upload(
&self,
@@ -443,7 +485,7 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
}
impl GenericRemoteStorage {
pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
let timeout = storage_config.timeout;
Ok(match &storage_config.storage {
RemoteStorageKind::LocalFs { local_path: path } => {
@@ -458,7 +500,7 @@ impl GenericRemoteStorage {
std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout).await?))
}
RemoteStorageKind::AzureContainer(azure_config) => {
let storage_account = azure_config
@@ -504,6 +546,16 @@ impl GenericRemoteStorage {
None => self.download(from, cancel).await,
}
}
/// The name of the bucket/container/etc.
pub fn bucket_name(&self) -> Option<&str> {
match self {
Self::LocalFs(_s) => None,
Self::AwsS3(s) => Some(s.bucket_name()),
Self::AzureBlob(s) => Some(s.container_name()),
Self::Unreliable(_s) => None,
}
}
}
/// Extra set of key-value pairs that contain arbitrary metadata about the storage entry.

View File

@@ -331,6 +331,17 @@ impl LocalFs {
}
impl RemoteStorage for LocalFs {
fn list_streaming(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>> {
let listing = self.list(prefix, mode, max_keys, cancel);
futures::stream::once(listing)
}
async fn list(
&self,
prefix: Option<&RemotePath>,

View File

@@ -16,16 +16,10 @@ use std::{
use anyhow::{anyhow, Context as _};
use aws_config::{
environment::credentials::EnvironmentVariableCredentialsProvider,
imds::credentials::ImdsCredentialsProvider,
meta::credentials::CredentialsProviderChain,
profile::ProfileFileCredentialsProvider,
provider_config::ProviderConfig,
default_provider::credentials::DefaultCredentialsChain,
retry::{RetryConfigBuilder, RetryMode},
web_identity_token::WebIdentityTokenCredentialsProvider,
BehaviorVersion,
};
use aws_credential_types::provider::SharedCredentialsProvider;
use aws_sdk_s3::{
config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep},
error::SdkError,
@@ -76,40 +70,27 @@ struct GetObjectRequest {
}
impl S3Bucket {
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
pub fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
pub async fn new(remote_storage_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
tracing::debug!(
"Creating s3 remote storage for S3 bucket {}",
remote_storage_config.bucket_name
);
let region = Some(Region::new(remote_storage_config.bucket_region.clone()));
let region = Region::new(remote_storage_config.bucket_region.clone());
let region_opt = Some(region.clone());
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
let credentials_provider = {
// uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
CredentialsProviderChain::first_try(
"env",
EnvironmentVariableCredentialsProvider::new(),
)
// uses "AWS_PROFILE" / `aws sso login --profile <profile>`
.or_else(
"profile-sso",
ProfileFileCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// needed to access remote extensions bucket
.or_else(
"token",
WebIdentityTokenCredentialsProvider::builder()
.configure(&provider_conf)
.build(),
)
// uses imds v2
.or_else("imds", ImdsCredentialsProvider::builder().build())
};
// https://docs.aws.amazon.com/sdkref/latest/guide/standardized-credentials.html
// https://docs.rs/aws-config/latest/aws_config/default_provider/credentials/struct.DefaultCredentialsChain.html
// Incomplete list of auth methods used by this:
// * "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"
// * "AWS_PROFILE" / `aws sso login --profile <profile>`
// * "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME"
// * http (ECS/EKS) container credentials
// * imds v2
let credentials_provider = DefaultCredentialsChain::builder()
.region(region)
.build()
.await;
// AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
@@ -118,9 +99,9 @@ impl S3Bucket {
#[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */
BehaviorVersion::v2023_11_09(),
)
.region(region)
.region(region_opt)
.identity_cache(IdentityCache::lazy().build())
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
.credentials_provider(credentials_provider)
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
let sdk_config: aws_config::SdkConfig = std::thread::scope(|s| {
@@ -405,6 +386,10 @@ impl S3Bucket {
}
Ok(())
}
pub fn bucket_name(&self) -> &str {
&self.bucket_name
}
}
pin_project_lite::pin_project! {
@@ -482,17 +467,16 @@ impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
}
impl RemoteStorage for S3Bucket {
async fn list(
fn list_streaming(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> Result<Listing, DownloadError> {
) -> impl Stream<Item = Result<Listing, DownloadError>> {
let kind = RequestKind::List;
// s3 sdk wants i32
let mut max_keys = max_keys.map(|mk| mk.get() as i32);
let mut result = Listing::default();
// get the passed prefix or if it is not set use prefix_in_bucket value
let list_prefix = prefix
@@ -504,89 +488,99 @@ impl RemoteStorage for S3Bucket {
})
});
let _permit = self.permit(kind, cancel).await?;
async_stream::stream! {
let _permit = self.permit(kind, cancel).await?;
let mut continuation_token = None;
let mut continuation_token = None;
'outer: loop {
let started_at = start_measuring_requests(kind);
loop {
let started_at = start_measuring_requests(kind);
// min of two Options, returning Some if one is value and another is
// None (None is smaller than anything, so plain min doesn't work).
let request_max_keys = self
.max_keys_per_list_response
.into_iter()
.chain(max_keys.into_iter())
.min();
let mut request = self
.client
.list_objects_v2()
.bucket(self.bucket_name.clone())
.set_prefix(list_prefix.clone())
.set_continuation_token(continuation_token.clone())
.set_max_keys(request_max_keys);
// min of two Options, returning Some if one is value and another is
// None (None is smaller than anything, so plain min doesn't work).
let request_max_keys = self
.max_keys_per_list_response
.into_iter()
.chain(max_keys.into_iter())
.min();
let mut request = self
.client
.list_objects_v2()
.bucket(self.bucket_name.clone())
.set_prefix(list_prefix.clone())
.set_continuation_token(continuation_token)
.set_max_keys(request_max_keys);
if let ListingMode::WithDelimiter = mode {
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
}
let request = request.send();
let response = tokio::select! {
res = request => res,
_ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
_ = cancel.cancelled() => return Err(DownloadError::Cancelled),
};
let response = response
.context("Failed to list S3 prefixes")
.map_err(DownloadError::Other);
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);
let response = response?;
let keys = response.contents();
let empty = Vec::new();
let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
for object in keys {
let object_path = object.key().expect("response does not contain a key");
let remote_path = self.s3_object_to_relative_path(object_path);
result.keys.push(remote_path);
if let Some(mut mk) = max_keys {
assert!(mk > 0);
mk -= 1;
if mk == 0 {
return Ok(result); // limit reached
}
max_keys = Some(mk);
if let ListingMode::WithDelimiter = mode {
request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
}
let request = request.send();
let response = tokio::select! {
res = request => Ok(res),
_ = tokio::time::sleep(self.timeout) => Err(DownloadError::Timeout),
_ = cancel.cancelled() => Err(DownloadError::Cancelled),
}?;
let response = response
.context("Failed to list S3 prefixes")
.map_err(DownloadError::Other);
let started_at = ScopeGuard::into_inner(started_at);
crate::metrics::BUCKET_METRICS
.req_seconds
.observe_elapsed(kind, &response, started_at);
let response = match response {
Ok(response) => response,
Err(e) => {
// The error is potentially retryable, so we must rewind the loop after yielding.
yield Err(e);
continue 'outer;
},
};
let keys = response.contents();
let prefixes = response.common_prefixes.as_deref().unwrap_or_default();
tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
let mut result = Listing::default();
for object in keys {
let object_path = object.key().expect("response does not contain a key");
let remote_path = self.s3_object_to_relative_path(object_path);
result.keys.push(remote_path);
if let Some(mut mk) = max_keys {
assert!(mk > 0);
mk -= 1;
if mk == 0 {
// limit reached
yield Ok(result);
break 'outer;
}
max_keys = Some(mk);
}
}
// S3 gives us prefixes like "foo/", we return them like "foo"
result.prefixes.extend(prefixes.iter().filter_map(|o| {
Some(
self.s3_object_to_relative_path(
o.prefix()?
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
),
)
}));
yield Ok(result);
continuation_token = match response.next_continuation_token {
Some(new_token) => Some(new_token),
None => break,
};
}
// S3 gives us prefixes like "foo/", we return them like "foo"
result.prefixes.extend(prefixes.iter().filter_map(|o| {
Some(
self.s3_object_to_relative_path(
o.prefix()?
.trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
),
)
}));
continuation_token = match response.next_continuation_token {
Some(new_token) => Some(new_token),
None => break,
};
}
Ok(result)
}
async fn upload(
@@ -1041,8 +1035,8 @@ mod tests {
use crate::{RemotePath, S3Bucket, S3Config};
#[test]
fn relative_path() {
#[tokio::test]
async fn relative_path() {
let all_paths = ["", "some/path", "some/path/"];
let all_paths: Vec<RemotePath> = all_paths
.iter()
@@ -1085,8 +1079,9 @@ mod tests {
max_keys_per_list_response: Some(5),
upload_storage_class: None,
};
let storage =
S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
let storage = S3Bucket::new(&config, std::time::Duration::ZERO)
.await
.expect("remote storage init");
for (test_path_idx, test_path) in all_paths.iter().enumerate() {
let result = storage.relative_path_to_s3_object(test_path);
let expected = expected_outputs[prefix_idx][test_path_idx];

View File

@@ -3,6 +3,7 @@
//! testing purposes.
use bytes::Bytes;
use futures::stream::Stream;
use futures::StreamExt;
use std::collections::HashMap;
use std::num::NonZeroU32;
use std::sync::Mutex;
@@ -107,6 +108,23 @@ impl UnreliableWrapper {
type VoidStorage = crate::LocalFs;
impl RemoteStorage for UnreliableWrapper {
fn list_streaming(
&self,
prefix: Option<&RemotePath>,
mode: ListingMode,
max_keys: Option<NonZeroU32>,
cancel: &CancellationToken,
) -> impl Stream<Item = Result<Listing, DownloadError>> {
async_stream::stream! {
self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
.map_err(DownloadError::Other)?;
let mut stream = self.inner
.list_streaming(prefix, mode, max_keys, cancel);
while let Some(item) = stream.next().await {
yield item;
}
}
}
async fn list(
&self,
prefix: Option<&RemotePath>,

View File

@@ -152,7 +152,7 @@ pub(crate) async fn upload_remote_data(
let mut upload_tasks = JoinSet::new();
let cancel = CancellationToken::new();
for i in 1..upload_tasks_count + 1 {
for i in 1..=upload_tasks_count {
let task_client = Arc::clone(client);
let cancel = cancel.clone();

View File

@@ -1,5 +1,6 @@
use anyhow::Context;
use camino::Utf8Path;
use futures::StreamExt;
use remote_storage::ListingMode;
use remote_storage::RemotePath;
use std::sync::Arc;
@@ -29,10 +30,10 @@ use super::{
/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
///
/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
/// since current default AWS S3 pagination limit is 1000.
/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
/// In the `MaybeEnabledStorageWithTestBlobs::setup`, we set the `max_keys_in_list_response` param to limit the keys in a single response.
/// This way, we are able to test the pagination, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
/// as the current default AWS S3 pagination limit is 1000.
/// (see <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>).
///
/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
@@ -87,6 +88,41 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
);
// list_streaming
let prefix_with_slash = base_prefix.add_trailing_slash();
let mut nested_remote_prefixes_st = test_client.list_streaming(
Some(&prefix_with_slash),
ListingMode::WithDelimiter,
None,
&cancel,
);
let mut nested_remote_prefixes_combined = HashSet::new();
let mut segments = 0;
let mut segment_max_size = 0;
while let Some(st) = nested_remote_prefixes_st.next().await {
let st = st?;
segment_max_size = segment_max_size.max(st.prefixes.len());
nested_remote_prefixes_combined.extend(st.prefixes.into_iter());
segments += 1;
}
assert!(segments > 1, "less than 2 segments: {segments}");
assert!(
segment_max_size * 2 <= nested_remote_prefixes_combined.len(),
"double of segment_max_size={segment_max_size} larger number of remote prefixes of {}",
nested_remote_prefixes_combined.len()
);
let remote_only_prefixes = nested_remote_prefixes_combined
.difference(&expected_remote_prefixes)
.collect::<HashSet<_>>();
let missing_uploaded_prefixes = expected_remote_prefixes
.difference(&nested_remote_prefixes_combined)
.collect::<HashSet<_>>();
assert_eq!(
remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
"remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
);
Ok(())
}

View File

@@ -31,6 +31,7 @@ struct EnabledAzure {
impl EnabledAzure {
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
let client = create_azure_client(max_keys_in_list_response)
.await
.context("Azure client creation")
.expect("Azure client creation failed");
@@ -187,7 +188,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
}
}
fn create_azure_client(
async fn create_azure_client(
max_keys_per_list_response: Option<i32>,
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
use rand::Rng;
@@ -221,6 +222,8 @@ fn create_azure_client(
timeout: Duration::from_secs(120),
};
Ok(Arc::new(
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
GenericRemoteStorage::from_config(&remote_storage_config)
.await
.context("remote storage init")?,
))
}

View File

@@ -197,6 +197,7 @@ struct EnabledS3 {
impl EnabledS3 {
async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
let client = create_s3_client(max_keys_in_list_response)
.await
.context("S3 client creation")
.expect("S3 client creation failed");
@@ -352,7 +353,7 @@ impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
}
}
fn create_s3_client(
async fn create_s3_client(
max_keys_per_list_response: Option<i32>,
) -> anyhow::Result<Arc<GenericRemoteStorage>> {
use rand::Rng;
@@ -385,7 +386,9 @@ fn create_s3_client(
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
Ok(Arc::new(
GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
GenericRemoteStorage::from_config(&remote_storage_config)
.await
.context("remote storage init")?,
))
}

View File

@@ -20,7 +20,6 @@ bincode.workspace = true
bytes.workspace = true
camino.workspace = true
chrono.workspace = true
heapless.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper = { workspace = true, features = ["full"] }

View File

@@ -33,6 +33,10 @@ pub enum Scope {
GenerationsApi,
// Allows access to control plane managment API and some storage controller endpoints.
Admin,
/// Allows access to storage controller APIs used by the scrubber, to interrogate the state
/// of a tenant & post scrub results.
Scrubber,
}
/// JWT payload. See docs/authentication.md for the format

View File

@@ -0,0 +1,114 @@
use std::{
fmt::Display,
time::{Duration, Instant},
};
use metrics::IntCounter;
/// Circuit breakers are for operations that are expensive and fallible: if they fail repeatedly,
/// we will stop attempting them for some period of time, to avoid denial-of-service from retries, and
/// to mitigate the log spam from repeated failures.
pub struct CircuitBreaker {
/// An identifier that enables us to log useful errors when a circuit is broken
name: String,
/// Consecutive failures since last success
fail_count: usize,
/// How many consecutive failures before we break the circuit
fail_threshold: usize,
/// If circuit is broken, when was it broken?
broken_at: Option<Instant>,
/// If set, we will auto-reset the circuit this long after it was broken. If None, broken
/// circuits stay broken forever, or until success() is called.
reset_period: Option<Duration>,
/// If this is true, no actual circuit-breaking happens. This is for overriding a circuit breaker
/// to permit something to keep running even if it would otherwise have tripped it.
short_circuit: bool,
}
impl CircuitBreaker {
pub fn new(name: String, fail_threshold: usize, reset_period: Option<Duration>) -> Self {
Self {
name,
fail_count: 0,
fail_threshold,
broken_at: None,
reset_period,
short_circuit: false,
}
}
/// Construct an unbreakable circuit breaker, for use in unit tests etc.
pub fn short_circuit() -> Self {
Self {
name: String::new(),
fail_threshold: 0,
fail_count: 0,
broken_at: None,
reset_period: None,
short_circuit: true,
}
}
pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
where
E: Display,
{
if self.short_circuit {
return;
}
self.fail_count += 1;
if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
self.break_circuit(metric, error);
}
}
/// Call this after successfully executing an operation
pub fn success(&mut self, metric: &IntCounter) {
self.fail_count = 0;
if let Some(broken_at) = &self.broken_at {
tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
humantime::format_duration(broken_at.elapsed()));
self.broken_at = None;
metric.inc();
}
}
/// Call this before attempting an operation, and skip the operation if we are currently broken.
pub fn is_broken(&mut self) -> bool {
if self.short_circuit {
return false;
}
if let Some(broken_at) = self.broken_at {
match self.reset_period {
Some(reset_period) if broken_at.elapsed() > reset_period => {
self.reset_circuit();
false
}
_ => true,
}
} else {
false
}
}
fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
where
E: Display,
{
self.broken_at = Some(Instant::now());
tracing::error!(breaker=%self.name, "Circuit breaker broken! Last error: {error}");
metric.inc();
}
fn reset_circuit(&mut self) {
self.broken_at = None;
self.fail_count = 0;
}
}

View File

@@ -1,196 +0,0 @@
//! A heapless buffer for events of sorts.
use std::ops;
use heapless::HistoryBuffer;
#[derive(Debug, Clone)]
pub struct HistoryBufferWithDropCounter<T, const L: usize> {
buffer: HistoryBuffer<T, L>,
drop_count: u64,
}
impl<T, const L: usize> HistoryBufferWithDropCounter<T, L> {
pub fn write(&mut self, data: T) {
let len_before = self.buffer.len();
self.buffer.write(data);
let len_after = self.buffer.len();
self.drop_count += u64::from(len_before == len_after);
}
pub fn drop_count(&self) -> u64 {
self.drop_count
}
pub fn map<U, F: Fn(&T) -> U>(&self, f: F) -> HistoryBufferWithDropCounter<U, L> {
let mut buffer = HistoryBuffer::new();
buffer.extend(self.buffer.oldest_ordered().map(f));
HistoryBufferWithDropCounter::<U, L> {
buffer,
drop_count: self.drop_count,
}
}
}
impl<T, const L: usize> Default for HistoryBufferWithDropCounter<T, L> {
fn default() -> Self {
Self {
buffer: HistoryBuffer::default(),
drop_count: 0,
}
}
}
impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
type Target = HistoryBuffer<T, L>;
fn deref(&self) -> &Self::Target {
&self.buffer
}
}
#[derive(serde::Serialize, serde::Deserialize)]
struct SerdeRepr<T> {
buffer: Vec<T>,
buffer_size: usize,
drop_count: u64,
}
impl<'a, T, const L: usize> From<&'a HistoryBufferWithDropCounter<T, L>> for SerdeRepr<T>
where
T: Clone + serde::Serialize,
{
fn from(value: &'a HistoryBufferWithDropCounter<T, L>) -> Self {
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
SerdeRepr {
buffer: buffer.iter().cloned().collect(),
buffer_size: L,
drop_count: *drop_count,
}
}
}
impl<T, const L: usize> serde::Serialize for HistoryBufferWithDropCounter<T, L>
where
T: Clone + serde::Serialize,
{
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
SerdeRepr::from(self).serialize(serializer)
}
}
impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
where
T: Clone + serde::Deserialize<'de>,
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let SerdeRepr {
buffer: des_buffer,
drop_count,
buffer_size,
} = SerdeRepr::<T>::deserialize(deserializer)?;
if buffer_size != L {
use serde::de::Error;
return Err(D::Error::custom(format!(
"invalid buffer_size, expecting {L} got {buffer_size}"
)));
}
let mut buffer = HistoryBuffer::new();
buffer.extend(des_buffer);
Ok(HistoryBufferWithDropCounter { buffer, drop_count })
}
}
#[cfg(test)]
mod test {
use super::HistoryBufferWithDropCounter;
#[test]
fn test_basics() {
let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
b.write(1);
b.write(2);
b.write(3);
assert!(b.iter().any(|e| *e == 2));
assert!(b.iter().any(|e| *e == 3));
assert!(!b.iter().any(|e| *e == 1));
// round-trip serde
let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
assert_eq!(
round_tripped.iter().cloned().collect::<Vec<_>>(),
b.iter().cloned().collect::<Vec<_>>()
);
}
#[test]
fn test_drop_count_works() {
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
b.write(1);
assert_eq!(b.drop_count(), 0);
b.write(2);
assert_eq!(b.drop_count(), 0);
b.write(3);
assert_eq!(b.drop_count(), 1);
b.write(4);
assert_eq!(b.drop_count(), 2);
}
#[test]
fn test_clone_works() {
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
b.write(1);
b.write(2);
b.write(3);
assert_eq!(b.drop_count(), 1);
let mut c = b.clone();
assert_eq!(c.drop_count(), 1);
assert!(c.iter().any(|e| *e == 2));
assert!(c.iter().any(|e| *e == 3));
assert!(!c.iter().any(|e| *e == 1));
c.write(4);
assert!(c.iter().any(|e| *e == 4));
assert!(!b.iter().any(|e| *e == 4));
}
#[test]
fn test_map() {
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
b.write(1);
assert_eq!(b.drop_count(), 0);
{
let c = b.map(|i| i + 10);
assert_eq!(c.oldest_ordered().cloned().collect::<Vec<_>>(), vec![11]);
assert_eq!(c.drop_count(), 0);
}
b.write(2);
assert_eq!(b.drop_count(), 0);
{
let c = b.map(|i| i + 10);
assert_eq!(
c.oldest_ordered().cloned().collect::<Vec<_>>(),
vec![11, 12]
);
assert_eq!(c.drop_count(), 0);
}
b.write(3);
assert_eq!(b.drop_count(), 1);
{
let c = b.map(|i| i + 10);
assert_eq!(
c.oldest_ordered().cloned().collect::<Vec<_>>(),
vec![12, 13]
);
assert_eq!(c.drop_count(), 1);
}
}
}

View File

@@ -52,17 +52,17 @@ struct RequestId(String);
/// There could be other ways to implement similar functionality:
///
/// * procmacros placed on top of all handler methods
/// With all the drawbacks of procmacros, brings no difference implementation-wise,
/// and little code reduction compared to the existing approach.
/// With all the drawbacks of procmacros, brings no difference implementation-wise,
/// and little code reduction compared to the existing approach.
///
/// * Another `TraitExt` with e.g. the `get_with_span`, `post_with_span` methods to do similar logic,
/// implemented for [`RouterBuilder`].
/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
/// implemented for [`RouterBuilder`].
/// Could be simpler, but we don't want to depend on [`routerify`] more, targeting to use other library later.
///
/// * In theory, a span guard could've been created in a pre-request middleware and placed into a global collection, to be dropped
/// later, in a post-response middleware.
/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
/// tries to achive with its `.instrument` used in the current approach.
/// later, in a post-response middleware.
/// Due to suspendable nature of the futures, would give contradictive results which is exactly the opposite of what `tracing-futures`
/// tries to achive with its `.instrument` used in the current approach.
///
/// If needed, a declarative macro to substitute the |r| ... closure boilerplate could be introduced.
pub async fn request_span<R, H>(request: Request<Body>, handler: H) -> R::Output

View File

@@ -302,17 +302,6 @@ pub struct TenantId(Id);
id_newtype!(TenantId);
/// Neon Connection Id identifies long-lived connections (for example a pagestream
/// connection with the page_service). Is used for better logging and tracing
///
/// NOTE: It (de)serializes as an array of hex bytes, so the string representation would look
/// like `[173,80,132,115,129,226,72,254,170,201,135,108,199,26,228,24]`.
/// See [`Id`] for alternative ways to serialize it.
#[derive(Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, PartialOrd, Ord)]
pub struct ConnectionId(Id);
id_newtype!(ConnectionId);
// A pair uniquely identifying Neon instance.
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct TenantTimelineId {

View File

@@ -59,8 +59,6 @@ pub mod signals;
pub mod fs_ext;
pub mod history_buffer;
pub mod measured_stream;
pub mod serde_percent;
@@ -98,6 +96,8 @@ pub mod poison;
pub mod toml_edit_ext;
pub mod circuit_breaker;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -49,6 +49,7 @@ pub struct TenantShardId {
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
pub const MIN: Self = Self(0);
/// The internal value of a ShardCount may be zero, which means "1 shard, but use
/// legacy format for TenantShardId that excludes the shard suffix", also known

View File

@@ -1,6 +1,7 @@
use std::collections::HashMap;
use bytes::Bytes;
use detach_ancestor::AncestorDetached;
use pageserver_api::{models::*, shard::TenantShardId};
use reqwest::{IntoUrl, Method, StatusCode};
use utils::{
@@ -418,6 +419,23 @@ impl Client {
}
}
pub async fn timeline_detach_ancestor(
&self,
tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Result<AncestorDetached> {
let uri = format!(
"{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor",
self.mgmt_api_endpoint
);
self.request(Method::PUT, &uri, ())
.await?
.json()
.await
.map_err(Error::ReceiveBody)
}
pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
let uri = format!(
"{}/v1/tenant/{}/reset",

View File

@@ -131,7 +131,7 @@ impl CompactionKey for Key {
pub type CompactionKeySpace<K> = Vec<Range<K>>;
/// Functions needed from all layers.
pub trait CompactionLayer<K: CompactionKey + ?Sized> {
pub trait CompactionLayer<K: CompactionKey> {
fn key_range(&self) -> &Range<K>;
fn lsn_range(&self) -> &Range<Lsn>;

View File

@@ -179,7 +179,7 @@ async fn main() -> anyhow::Result<()> {
.get("remote_storage")
.expect("need remote_storage");
let config = RemoteStorageConfig::from_toml(toml_item)?;
let storage = remote_storage::GenericRemoteStorage::from_config(&config);
let storage = remote_storage::GenericRemoteStorage::from_config(&config).await;
let cancel = CancellationToken::new();
storage
.unwrap()

View File

@@ -14,12 +14,14 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
}
(Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
(Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
(Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Pageserver auth",
claims.scope
)
.into(),
)),
(Scope::Admin | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Scrubber, _) => {
Err(AuthError(
format!(
"JWT scope '{:?}' is ineligible for Pageserver auth",
claims.scope
)
.into(),
))
}
}
}

View File

@@ -2,30 +2,35 @@
//! Main entry point for the Page Server executable.
use std::env;
use std::env::{var, VarError};
use std::io::Read;
use std::sync::Arc;
use std::time::Duration;
use std::{env, ops::ControlFlow, str::FromStr};
use anyhow::{anyhow, Context};
use camino::Utf8Path;
use clap::{Arg, ArgAction, Command};
use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
use pageserver::config::PageserverIdentity;
use pageserver::control_plane_client::ControlPlaneClient;
use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
use pageserver::task_mgr::WALRECEIVER_RUNTIME;
use pageserver::tenant::{secondary, TenantSharedResources};
use pageserver::{
CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, LibpqEndpointListener,
};
use remote_storage::GenericRemoteStorage;
use tokio::signal::unix::SignalKind;
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;
use tracing::*;
use metrics::set_build_info_metric;
use pageserver::{
config::{defaults::*, PageServerConf},
config::PageServerConf,
context::{DownloadBehavior, RequestContext},
deletion_queue::DeletionQueue,
http, page_cache, page_service, task_mgr,
@@ -84,18 +89,13 @@ fn main() -> anyhow::Result<()> {
.with_context(|| format!("Error opening workdir '{workdir}'"))?;
let cfg_file_path = workdir.join("pageserver.toml");
let identity_file_path = workdir.join("identity.toml");
// Set CWD to workdir for non-daemon modes
env::set_current_dir(&workdir)
.with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?;
let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
ControlFlow::Continue(conf) => conf,
ControlFlow::Break(()) => {
info!("Pageserver config init successful");
return Ok(());
}
};
let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;
// Initialize logging.
//
@@ -150,70 +150,55 @@ fn main() -> anyhow::Result<()> {
}
fn initialize_config(
identity_file_path: &Utf8Path,
cfg_file_path: &Utf8Path,
arg_matches: clap::ArgMatches,
workdir: &Utf8Path,
) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
let init = arg_matches.get_flag("init");
let file_contents: Option<toml_edit::Document> = match std::fs::File::open(cfg_file_path) {
) -> anyhow::Result<&'static PageServerConf> {
// The deployment orchestrator writes out an indentity file containing the node id
// for all pageservers. This file is the source of truth for the node id. In order
// to allow for rolling back pageserver releases, the node id is also included in
// the pageserver config that the deployment orchestrator writes to disk for the pageserver.
// A rolled back version of the pageserver will get the node id from the pageserver.toml
// config file.
let identity = match std::fs::File::open(identity_file_path) {
Ok(mut f) => {
if init {
anyhow::bail!("config file already exists: {cfg_file_path}");
let md = f.metadata().context("stat config file")?;
if !md.is_file() {
anyhow::bail!("Pageserver found identity file but it is a dir entry: {identity_file_path}. Aborting start up ...");
}
let mut s = String::new();
f.read_to_string(&mut s).context("read identity file")?;
toml_edit::de::from_str::<PageserverIdentity>(&s)?
}
Err(e) => {
anyhow::bail!("Pageserver could not read identity file: {identity_file_path}: {e}. Aborting start up ...");
}
};
let config: toml_edit::Document = match std::fs::File::open(cfg_file_path) {
Ok(mut f) => {
let md = f.metadata().context("stat config file")?;
if md.is_file() {
let mut s = String::new();
f.read_to_string(&mut s).context("read config file")?;
Some(s.parse().context("parse config file toml")?)
s.parse().context("parse config file toml")?
} else {
anyhow::bail!("directory entry exists but is not a file: {cfg_file_path}");
}
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => None,
Err(e) => {
anyhow::bail!("open pageserver config: {e}: {cfg_file_path}");
}
};
let mut effective_config = file_contents.unwrap_or_else(|| {
DEFAULT_CONFIG_FILE
.parse()
.expect("unit tests ensure this works")
});
// Patch with overrides from the command line
if let Some(values) = arg_matches.get_many::<String>("config-override") {
for option_line in values {
let doc = toml_edit::Document::from_str(option_line).with_context(|| {
format!("Option '{option_line}' could not be parsed as a toml document")
})?;
for (key, item) in doc.iter() {
effective_config.insert(key, item.clone());
}
}
}
debug!("Resulting toml: {effective_config}");
debug!("Using pageserver toml: {config}");
// Construct the runtime representation
let conf = PageServerConf::parse_and_validate(&effective_config, workdir)
let conf = PageServerConf::parse_and_validate(identity.id, &config, workdir)
.context("Failed to parse pageserver configuration")?;
if init {
info!("Writing pageserver config to '{cfg_file_path}'");
std::fs::write(cfg_file_path, effective_config.to_string())
.with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
info!("Config successfully written to '{cfg_file_path}'")
}
Ok(if init {
ControlFlow::Break(())
} else {
ControlFlow::Continue(Box::leak(Box::new(conf)))
})
Ok(Box::leak(Box::new(conf)))
}
struct WaitForPhaseResult<F: std::future::Future + Unpin> {
@@ -305,6 +290,7 @@ fn start_pageserver(
// Create and lock PID file. This ensures that there cannot be more than one
// pageserver process running at the same time.
let lock_file_path = conf.workdir.join(PID_FILE_NAME);
info!("Claiming pid file at {lock_file_path:?}...");
let lock_file =
utils::pid_file::claim_for_current_process(&lock_file_path).context("claim pid file")?;
info!("Claimed pid file at {lock_file_path:?}");
@@ -385,7 +371,7 @@ fn start_pageserver(
let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
// Set up remote storage client
let remote_storage = create_remote_storage_client(conf)?;
let remote_storage = BACKGROUND_RUNTIME.block_on(create_remote_storage_client(conf))?;
// Set up deletion queue
let (deletion_queue, deletion_workers) = DeletionQueue::new(
@@ -430,8 +416,10 @@ fn start_pageserver(
// Scan the local 'tenants/' directory and start loading the tenants
let deletion_queue_client = deletion_queue.new_client();
let background_purges = mgr::BackgroundPurges::default();
let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
conf,
background_purges.clone(),
TenantSharedResources {
broker_client: broker_client.clone(),
remote_storage: remote_storage.clone(),
@@ -523,7 +511,7 @@ fn start_pageserver(
}
});
let secondary_controller = secondary::spawn_tasks(
let (secondary_controller, secondary_controller_tasks) = secondary::spawn_tasks(
tenant_manager.clone(),
remote_storage.clone(),
background_jobs_barrier.clone(),
@@ -536,18 +524,19 @@ fn start_pageserver(
// been configured.
let disk_usage_eviction_state: Arc<disk_usage_eviction_task::State> = Arc::default();
launch_disk_usage_global_eviction_task(
let disk_usage_eviction_task = launch_disk_usage_global_eviction_task(
conf,
remote_storage.clone(),
disk_usage_eviction_state.clone(),
tenant_manager.clone(),
background_jobs_barrier.clone(),
)?;
);
// Start up the service to handle HTTP mgmt API request. We created the
// listener earlier already.
{
let _rt_guard = MGMT_REQUEST_RUNTIME.enter();
let http_endpoint_listener = {
let _rt_guard = MGMT_REQUEST_RUNTIME.enter(); // for hyper
let cancel = CancellationToken::new();
let router_state = Arc::new(
http::routes::State::new(
@@ -568,78 +557,44 @@ fn start_pageserver(
let service = utils::http::RouterService::new(router).unwrap();
let server = hyper::Server::from_tcp(http_listener)?
.serve(service)
.with_graceful_shutdown(task_mgr::shutdown_watcher());
.with_graceful_shutdown({
let cancel = cancel.clone();
async move { cancel.clone().cancelled().await }
});
task_mgr::spawn(
MGMT_REQUEST_RUNTIME.handle(),
TaskKind::HttpEndpointListener,
None,
None,
let task = MGMT_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"http endpoint listener",
true,
async {
server.await?;
Ok(())
},
);
}
server,
));
HttpEndpointListener(CancellableTask { task, cancel })
};
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
let metrics_ctx = RequestContext::todo_child(
TaskKind::MetricsCollection,
// This task itself shouldn't download anything.
// The actual size calculation does need downloads, and
// creates a child context with the right DownloadBehavior.
DownloadBehavior::Error,
);
let consumption_metrics_tasks = {
let cancel = shutdown_pageserver.child_token();
let task = crate::BACKGROUND_RUNTIME.spawn({
let tenant_manager = tenant_manager.clone();
let cancel = cancel.clone();
async move {
// first wait until background jobs are cleared to launch.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
tokio::select! {
_ = cancel.cancelled() => { return; },
_ = background_jobs_barrier.wait() => {}
};
let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
task_mgr::spawn(
crate::BACKGROUND_RUNTIME.handle(),
TaskKind::MetricsCollection,
None,
None,
"consumption metrics collection",
true,
{
let tenant_manager = tenant_manager.clone();
async move {
// first wait until background jobs are cleared to launch.
//
// this is because we only process active tenants and timelines, and the
// Timeline::get_current_logical_size will spawn the logical size calculation,
// which will not be rate-limited.
let cancel = task_mgr::shutdown_token();
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => {}
};
pageserver::consumption_metrics::collect_metrics(
tenant_manager,
metric_collection_endpoint,
&conf.metric_collection_bucket,
conf.metric_collection_interval,
conf.cached_metric_collection_interval,
conf.synthetic_size_calculation_interval,
conf.id,
local_disk_storage,
cancel,
metrics_ctx,
)
.instrument(info_span!("metrics_collection"))
.await?;
Ok(())
}
},
);
}
pageserver::consumption_metrics::run(conf, tenant_manager, cancel).await;
}
});
ConsumptionMetricsTasks(CancellableTask { task, cancel })
};
// Spawn a task to listen for libpq connections. It will spawn further tasks
// for each connection. We created the listener earlier already.
{
let libpq_listener = {
let cancel = CancellationToken::new();
let libpq_ctx = RequestContext::todo_child(
TaskKind::LibpqEndpointListener,
// listener task shouldn't need to download anything. (We will
@@ -648,29 +603,20 @@ fn start_pageserver(
// accept connections.)
DownloadBehavior::Error,
);
task_mgr::spawn(
COMPUTE_REQUEST_RUNTIME.handle(),
TaskKind::LibpqEndpointListener,
None,
None,
"libpq endpoint listener",
true,
{
let tenant_manager = tenant_manager.clone();
async move {
page_service::libpq_listener_main(
tenant_manager,
pg_auth,
pageserver_listener,
conf.pg_auth_type,
libpq_ctx,
task_mgr::shutdown_token(),
)
.await
}
},
);
}
let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"libpq listener",
page_service::libpq_listener_main(
tenant_manager.clone(),
pg_auth,
pageserver_listener,
conf.pg_auth_type,
libpq_ctx,
cancel.clone(),
),
));
LibpqEndpointListener(CancellableTask { task, cancel })
};
let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
@@ -696,13 +642,24 @@ fn start_pageserver(
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
// The plan is to change that over time.
shutdown_pageserver.take();
pageserver::shutdown_pageserver(&tenant_manager, deletion_queue.clone(), 0).await;
pageserver::shutdown_pageserver(
http_endpoint_listener,
libpq_listener,
consumption_metrics_tasks,
disk_usage_eviction_task,
&tenant_manager,
background_purges,
deletion_queue.clone(),
secondary_controller_tasks,
0,
)
.await;
unreachable!()
})
}
}
fn create_remote_storage_client(
async fn create_remote_storage_client(
conf: &'static PageServerConf,
) -> anyhow::Result<GenericRemoteStorage> {
let config = if let Some(config) = &conf.remote_storage_config {
@@ -712,7 +669,7 @@ fn create_remote_storage_client(
};
// Create the client
let mut remote_storage = GenericRemoteStorage::from_config(config)?;
let mut remote_storage = GenericRemoteStorage::from_config(config).await?;
// If `test_remote_failures` is non-zero, wrap the client with a
// wrapper that simulates failures.
@@ -735,28 +692,12 @@ fn cli() -> Command {
Command::new("Neon page server")
.about("Materializes WAL stream to pages and serves them to the postgres")
.version(version())
.arg(
Arg::new("init")
.long("init")
.action(ArgAction::SetTrue)
.help("Initialize pageserver with all given config overrides"),
)
.arg(
Arg::new("workdir")
.short('D')
.long("workdir")
.help("Working directory for the pageserver"),
)
// See `settings.md` for more details on the extra configuration patameters pageserver can process
.arg(
Arg::new("config-override")
.long("config-override")
.short('c')
.num_args(1)
.action(ArgAction::Append)
.help("Additional configuration overrides of the ones from the toml config file (or new ones to add there). \
Any option has to be a valid toml document, example: `-c=\"foo='hey'\"` `-c=\"foo={value=1}\"`"),
)
.arg(
Arg::new("enabled-features")
.long("enabled-features")

View File

@@ -7,12 +7,11 @@
use anyhow::{anyhow, bail, ensure, Context, Result};
use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId};
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde;
use serde::de::IntoDeserializer;
use serde::{self, Deserialize};
use std::env;
use storage_broker::Uri;
use utils::crashsafe::path_with_suffix_extension;
use utils::id::ConnectionId;
use utils::logging::SecretString;
use once_cell::sync::OnceCell;
@@ -69,7 +68,6 @@ pub mod defaults {
super::ConfigurableSemaphore::DEFAULT_INITIAL.get();
pub const DEFAULT_METRIC_COLLECTION_INTERVAL: &str = "10 min";
pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "0s";
pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
@@ -124,7 +122,6 @@ pub mod defaults {
#concurrent_tenant_warmup = '{DEFAULT_CONCURRENT_TENANT_WARMUP}'
#metric_collection_interval = '{DEFAULT_METRIC_COLLECTION_INTERVAL}'
#cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
#synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
#disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
@@ -239,7 +236,6 @@ pub struct PageServerConf {
// How often to collect metrics and send them to the metrics endpoint.
pub metric_collection_interval: Duration,
// How often to send unchanged cached metrics to the metrics endpoint.
pub cached_metric_collection_interval: Duration,
pub metric_collection_endpoint: Option<Url>,
pub metric_collection_bucket: Option<RemoteStorageConfig>,
pub synthetic_size_calculation_interval: Duration,
@@ -371,7 +367,6 @@ struct PageServerConfigBuilder {
concurrent_tenant_size_logical_size_queries: BuilderValue<NonZeroUsize>,
metric_collection_interval: BuilderValue<Duration>,
cached_metric_collection_interval: BuilderValue<Duration>,
metric_collection_endpoint: BuilderValue<Option<Url>>,
synthetic_size_calculation_interval: BuilderValue<Duration>,
metric_collection_bucket: BuilderValue<Option<RemoteStorageConfig>>,
@@ -411,6 +406,13 @@ struct PageServerConfigBuilder {
}
impl PageServerConfigBuilder {
fn new(node_id: NodeId) -> Self {
let mut this = Self::default();
this.id(node_id);
this
}
#[inline(always)]
fn default_values() -> Self {
use self::BuilderValue::*;
@@ -455,10 +457,6 @@ impl PageServerConfigBuilder {
DEFAULT_METRIC_COLLECTION_INTERVAL,
)
.expect("cannot parse default metric collection interval")),
cached_metric_collection_interval: Set(humantime::parse_duration(
DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL,
)
.expect("cannot parse default cached_metric_collection_interval")),
synthetic_size_calculation_interval: Set(humantime::parse_duration(
DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL,
)
@@ -590,14 +588,6 @@ impl PageServerConfigBuilder {
self.metric_collection_interval = BuilderValue::Set(metric_collection_interval)
}
pub fn cached_metric_collection_interval(
&mut self,
cached_metric_collection_interval: Duration,
) {
self.cached_metric_collection_interval =
BuilderValue::Set(cached_metric_collection_interval)
}
pub fn metric_collection_endpoint(&mut self, metric_collection_endpoint: Option<Url>) {
self.metric_collection_endpoint = BuilderValue::Set(metric_collection_endpoint)
}
@@ -731,7 +721,6 @@ impl PageServerConfigBuilder {
broker_keepalive_interval,
log_format,
metric_collection_interval,
cached_metric_collection_interval,
metric_collection_endpoint,
metric_collection_bucket,
synthetic_size_calculation_interval,
@@ -870,22 +859,6 @@ impl PageServerConf {
)
}
pub fn traces_path(&self) -> Utf8PathBuf {
self.workdir.join("traces")
}
pub fn trace_path(
&self,
tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
connection_id: &ConnectionId,
) -> Utf8PathBuf {
self.traces_path()
.join(tenant_shard_id.to_string())
.join(timeline_id.to_string())
.join(connection_id.to_string())
}
/// Turns storage remote path of a file into its local path.
pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
remote_path.with_base(&self.workdir)
@@ -915,8 +888,12 @@ impl PageServerConf {
/// validating the input and failing on errors.
///
/// This leaves any options not present in the file in the built-in defaults.
pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result<Self> {
let mut builder = PageServerConfigBuilder::default();
pub fn parse_and_validate(
node_id: NodeId,
toml: &Document,
workdir: &Utf8Path,
) -> anyhow::Result<Self> {
let mut builder = PageServerConfigBuilder::new(node_id);
builder.workdir(workdir.to_owned());
let mut t_conf = TenantConfOpt::default();
@@ -947,7 +924,8 @@ impl PageServerConf {
"tenant_config" => {
t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
}
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
"id" => {}, // Ignoring `id` field in pageserver.toml - using identity.toml as the source of truth
// Logging is not set up yet, so we can't do it.
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
"broker_keepalive_interval" => builder.broker_keepalive_interval(parse_toml_duration(key, item)?),
"log_format" => builder.log_format(
@@ -964,7 +942,6 @@ impl PageServerConf {
NonZeroUsize::new(permits).context("initial semaphore permits out of range: 0, use other configuration to disable a feature")?
}),
"metric_collection_interval" => builder.metric_collection_interval(parse_toml_duration(key, item)?),
"cached_metric_collection_interval" => builder.cached_metric_collection_interval(parse_toml_duration(key, item)?),
"metric_collection_endpoint" => {
let endpoint = parse_toml_string(key, item)?.parse().context("failed to parse metric_collection_endpoint")?;
builder.metric_collection_endpoint(Some(endpoint));
@@ -1097,7 +1074,6 @@ impl PageServerConf {
eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore::default(
),
metric_collection_interval: Duration::from_secs(60),
cached_metric_collection_interval: Duration::from_secs(60 * 60),
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
metric_collection_bucket: None,
synthetic_size_calculation_interval: Duration::from_secs(60),
@@ -1126,6 +1102,12 @@ impl PageServerConf {
}
}
#[derive(Deserialize)]
#[serde(deny_unknown_fields)]
pub struct PageserverIdentity {
pub id: NodeId,
}
// Helper functions to parse a toml Item
fn parse_toml_string(name: &str, item: &Item) -> Result<String> {
@@ -1276,7 +1258,6 @@ initial_superuser_name = 'zzzz'
id = 10
metric_collection_interval = '222 s'
cached_metric_collection_interval = '22200 s'
metric_collection_endpoint = 'http://localhost:80/metrics'
synthetic_size_calculation_interval = '333 s'
@@ -1296,7 +1277,7 @@ background_task_maximum_delay = '334 s'
);
let toml = config_string.parse()?;
let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
assert_eq!(
@@ -1332,9 +1313,6 @@ background_task_maximum_delay = '334 s'
metric_collection_interval: humantime::parse_duration(
defaults::DEFAULT_METRIC_COLLECTION_INTERVAL
)?,
cached_metric_collection_interval: humantime::parse_duration(
defaults::DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL
)?,
metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT,
metric_collection_bucket: None,
synthetic_size_calculation_interval: humantime::parse_duration(
@@ -1381,7 +1359,7 @@ background_task_maximum_delay = '334 s'
);
let toml = config_string.parse()?;
let parsed_config = PageServerConf::parse_and_validate(&toml, &workdir)
let parsed_config = PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
.unwrap_or_else(|e| panic!("Failed to parse config '{config_string}', reason: {e:?}"));
assert_eq!(
@@ -1413,7 +1391,6 @@ background_task_maximum_delay = '334 s'
eviction_task_immitated_concurrent_logical_size_queries:
ConfigurableSemaphore::default(),
metric_collection_interval: Duration::from_secs(222),
cached_metric_collection_interval: Duration::from_secs(22200),
metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?),
metric_collection_bucket: None,
synthetic_size_calculation_interval: Duration::from_secs(333),
@@ -1472,12 +1449,13 @@ broker_endpoint = '{broker_endpoint}'
let toml = config_string.parse()?;
let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
.unwrap_or_else(|e| {
panic!("Failed to parse config '{config_string}', reason: {e:?}")
})
.remote_storage_config
.expect("Should have remote storage config for the local FS");
let parsed_remote_storage_config =
PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
.unwrap_or_else(|e| {
panic!("Failed to parse config '{config_string}', reason: {e:?}")
})
.remote_storage_config
.expect("Should have remote storage config for the local FS");
assert_eq!(
parsed_remote_storage_config,
@@ -1533,12 +1511,13 @@ broker_endpoint = '{broker_endpoint}'
let toml = config_string.parse()?;
let parsed_remote_storage_config = PageServerConf::parse_and_validate(&toml, &workdir)
.unwrap_or_else(|e| {
panic!("Failed to parse config '{config_string}', reason: {e:?}")
})
.remote_storage_config
.expect("Should have remote storage config for S3");
let parsed_remote_storage_config =
PageServerConf::parse_and_validate(NodeId(10), &toml, &workdir)
.unwrap_or_else(|e| {
panic!("Failed to parse config '{config_string}', reason: {e:?}")
})
.remote_storage_config
.expect("Should have remote storage config for S3");
assert_eq!(
parsed_remote_storage_config,
@@ -1560,34 +1539,6 @@ broker_endpoint = '{broker_endpoint}'
Ok(())
}
#[test]
fn parse_tenant_config() -> anyhow::Result<()> {
let tempdir = tempdir()?;
let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;
let broker_endpoint = "http://127.0.0.1:7777";
let trace_read_requests = true;
let config_string = format!(
r#"{ALL_BASE_VALUES_TOML}
pg_distrib_dir='{pg_distrib_dir}'
broker_endpoint = '{broker_endpoint}'
[tenant_config]
trace_read_requests = {trace_read_requests}"#,
);
let toml = config_string.parse()?;
let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
assert_eq!(
conf.default_tenant_conf.trace_read_requests, trace_read_requests,
"Tenant config from pageserver config file should be parsed and udpated values used as defaults for all tenants",
);
Ok(())
}
#[test]
fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
let config_string = r#"
@@ -1645,7 +1596,7 @@ threshold = "20m"
"#,
);
let toml: Document = pageserver_conf_toml.parse()?;
let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
let conf = PageServerConf::parse_and_validate(NodeId(333), &toml, &workdir)?;
assert_eq!(conf.pg_distrib_dir, pg_distrib_dir);
assert_eq!(
@@ -1661,7 +1612,11 @@ threshold = "20m"
.evictions_low_residence_duration_metric_threshold,
Duration::from_secs(20 * 60)
);
assert_eq!(conf.id, NodeId(222));
// Assert that the node id provided by the indentity file (threaded
// through the call to [`PageServerConf::parse_and_validate`] is
// used.
assert_eq!(conf.id, NodeId(333));
assert_eq!(
conf.disk_usage_based_eviction,
Some(DiskUsageEvictionTaskConfig {
@@ -1670,7 +1625,7 @@ threshold = "20m"
period: Duration::from_secs(10),
#[cfg(feature = "testing")]
mock_statvfs: None,
eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
eviction_order: Default::default(),
})
);
@@ -1706,7 +1661,7 @@ threshold = "20m"
"#,
);
let toml: Document = pageserver_conf_toml.parse().unwrap();
let conf = PageServerConf::parse_and_validate(&toml, &workdir).unwrap();
let conf = PageServerConf::parse_and_validate(NodeId(222), &toml, &workdir).unwrap();
match &conf.default_tenant_conf.eviction_policy {
EvictionPolicy::OnlyImitiate(t) => {
@@ -1725,7 +1680,7 @@ threshold = "20m"
remote_storage = {}
"#;
let doc = toml_edit::Document::from_str(input).unwrap();
let err = PageServerConf::parse_and_validate(&doc, &workdir)
let err = PageServerConf::parse_and_validate(NodeId(222), &doc, &workdir)
.expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage");
assert!(format!("{err}").contains("remote_storage"), "{err}");
}

View File

@@ -1,5 +1,6 @@
//! Periodically collect consumption metrics for all active tenants
//! and push them to a HTTP endpoint.
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::size::CalculateSyntheticSizeError;
@@ -39,56 +40,74 @@ type RawMetric = (MetricsKey, (EventType, u64));
/// for deduplication, but that is no longer needed.
type Cache = HashMap<MetricsKey, (EventType, u64)>;
pub async fn run(
conf: &'static PageServerConf,
tenant_manager: Arc<TenantManager>,
cancel: CancellationToken,
) {
let Some(metric_collection_endpoint) = conf.metric_collection_endpoint.as_ref() else {
return;
};
let local_disk_storage = conf.workdir.join("last_consumption_metrics.json");
let metrics_ctx = RequestContext::todo_child(
TaskKind::MetricsCollection,
// This task itself shouldn't download anything.
// The actual size calculation does need downloads, and
// creates a child context with the right DownloadBehavior.
DownloadBehavior::Error,
);
let collect_metrics = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"consumption metrics collection",
collect_metrics(
tenant_manager.clone(),
metric_collection_endpoint,
&conf.metric_collection_bucket,
conf.metric_collection_interval,
conf.id,
local_disk_storage,
cancel.clone(),
metrics_ctx,
)
.instrument(info_span!("metrics_collection")),
));
let worker_ctx =
RequestContext::todo_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
let synthetic_size_worker = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"synthetic size calculation",
calculate_synthetic_size_worker(
tenant_manager.clone(),
conf.synthetic_size_calculation_interval,
cancel.clone(),
worker_ctx,
)
.instrument(info_span!("synthetic_size_worker")),
));
let (collect_metrics, synthetic_size_worker) =
futures::future::join(collect_metrics, synthetic_size_worker).await;
collect_metrics
.expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process");
synthetic_size_worker
.expect("unreachable: exit_on_panic_or_error would catch the panic and exit the process");
}
/// Main thread that serves metrics collection
#[allow(clippy::too_many_arguments)]
pub async fn collect_metrics(
async fn collect_metrics(
tenant_manager: Arc<TenantManager>,
metric_collection_endpoint: &Url,
metric_collection_bucket: &Option<RemoteStorageConfig>,
metric_collection_interval: Duration,
_cached_metric_collection_interval: Duration,
synthetic_size_calculation_interval: Duration,
node_id: NodeId,
local_disk_storage: Utf8PathBuf,
cancel: CancellationToken,
ctx: RequestContext,
) -> anyhow::Result<()> {
if _cached_metric_collection_interval != Duration::ZERO {
tracing::warn!(
"cached_metric_collection_interval is no longer used, please set it to zero."
)
}
// spin up background worker that caclulates tenant sizes
let worker_ctx =
ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::CalculateSyntheticSize,
None,
None,
"synthetic size calculation",
false,
{
let tenant_manager = tenant_manager.clone();
async move {
calculate_synthetic_size_worker(
tenant_manager,
synthetic_size_calculation_interval,
&cancel,
&worker_ctx,
)
.instrument(info_span!("synthetic_size_worker"))
.await?;
Ok(())
}
},
);
let path: Arc<Utf8PathBuf> = Arc::new(local_disk_storage);
let cancel = task_mgr::shutdown_token();
let restore_and_reschedule = restore_and_reschedule(&path, metric_collection_interval);
let mut cached_metrics = tokio::select! {
@@ -103,7 +122,7 @@ pub async fn collect_metrics(
.expect("Failed to create http client with timeout");
let bucket_client = if let Some(bucket_config) = metric_collection_bucket {
match GenericRemoteStorage::from_config(bucket_config) {
match GenericRemoteStorage::from_config(bucket_config).await {
Ok(client) => Some(client),
Err(e) => {
// Non-fatal error: if we were given an invalid config, we will proceed
@@ -175,11 +194,9 @@ pub async fn collect_metrics(
BackgroundLoopKind::ConsumptionMetricsCollectMetrics,
);
let res = tokio::time::timeout_at(
started_at + metric_collection_interval,
task_mgr::shutdown_token().cancelled(),
)
.await;
let res =
tokio::time::timeout_at(started_at + metric_collection_interval, cancel.cancelled())
.await;
if res.is_ok() {
return Ok(());
}
@@ -279,8 +296,8 @@ async fn reschedule(
async fn calculate_synthetic_size_worker(
tenant_manager: Arc<TenantManager>,
synthetic_size_calculation_interval: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
cancel: CancellationToken,
ctx: RequestContext,
) -> anyhow::Result<()> {
info!("starting calculate_synthetic_size_worker");
scopeguard::defer! {
@@ -320,7 +337,7 @@ async fn calculate_synthetic_size_worker(
// there is never any reason to exit calculate_synthetic_size_worker following any
// return value -- we don't need to care about shutdown because no tenant is found when
// pageserver is shut down.
calculate_and_log(&tenant, cancel, ctx).await;
calculate_and_log(&tenant, &cancel, &ctx).await;
}
crate::tenant::tasks::warn_when_period_overrun(

View File

@@ -59,6 +59,7 @@
//! 1. It should be easy to forward the context to callees.
//! 2. To propagate more data from high-level to low-level code, the functions in
//! the middle should not need to be modified.
//!
//! The solution is to have a container structure ([`RequestContext`]) that
//! carries the information. Functions that don't care about what's in it
//! pass it along to callees.

View File

@@ -828,9 +828,9 @@ mod test {
}
}
fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
let test_name = Box::leak(Box::new(format!("deletion_queue__{test_name}")));
let harness = TenantHarness::create(test_name)?;
let harness = TenantHarness::create(test_name).await?;
// We do not load() the harness: we only need its config and remote_storage
@@ -844,7 +844,9 @@ mod test {
},
timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
};
let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
let storage = GenericRemoteStorage::from_config(&storage_config)
.await
.unwrap();
let mock_control_plane = MockControlPlane::new();
@@ -922,7 +924,9 @@ mod test {
#[tokio::test]
async fn deletion_queue_smoke() -> anyhow::Result<()> {
// Basic test that the deletion queue processes the deletions we pass into it
let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
let ctx = setup("deletion_queue_smoke")
.await
.expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;
@@ -992,7 +996,9 @@ mod test {
#[tokio::test]
async fn deletion_queue_validation() -> anyhow::Result<()> {
let ctx = setup("deletion_queue_validation").expect("Failed test setup");
let ctx = setup("deletion_queue_validation")
.await
.expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;
@@ -1051,7 +1057,9 @@ mod test {
#[tokio::test]
async fn deletion_queue_recovery() -> anyhow::Result<()> {
// Basic test that the deletion queue processes the deletions we pass into it
let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
let mut ctx = setup("deletion_queue_recovery")
.await
.expect("Failed test setup");
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;

View File

@@ -59,13 +59,14 @@ use utils::{completion, id::TimelineId};
use crate::{
config::PageServerConf,
metrics::disk_usage_based_eviction::METRICS,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
task_mgr::{self, BACKGROUND_RUNTIME},
tenant::{
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
},
CancellableTask, DiskUsageEvictionTask,
};
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -83,17 +84,9 @@ pub struct DiskUsageEvictionTaskConfig {
/// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
/// partitioning.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(tag = "type", content = "args")]
pub enum EvictionOrder {
/// Order the layers to be evicted by how recently they have been accessed in absolute
/// time.
///
/// This strategy is unfair when some tenants grow faster than others towards the slower
/// growing.
#[default]
AbsoluteAccessed,
/// Order the layers to be evicted by how recently they have been accessed relatively within
/// the set of resident layers of a tenant.
RelativeAccessed {
@@ -108,6 +101,14 @@ pub enum EvictionOrder {
},
}
impl Default for EvictionOrder {
fn default() -> Self {
Self::RelativeAccessed {
highest_layer_count_loses_first: true,
}
}
}
fn default_highest_layer_count_loses_first() -> bool {
true
}
@@ -117,11 +118,6 @@ impl EvictionOrder {
use EvictionOrder::*;
match self {
AbsoluteAccessed => {
candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.last_activity_ts)
});
}
RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
(*partition, candidate.relative_last_activity)
}),
@@ -134,7 +130,6 @@ impl EvictionOrder {
use EvictionOrder::*;
match self {
AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
RelativeAccessed {
highest_layer_count_loses_first,
} => {
@@ -192,36 +187,34 @@ pub fn launch_disk_usage_global_eviction_task(
state: Arc<State>,
tenant_manager: Arc<TenantManager>,
background_jobs_barrier: completion::Barrier,
) -> anyhow::Result<()> {
) -> Option<DiskUsageEvictionTask> {
let Some(task_config) = &conf.disk_usage_based_eviction else {
info!("disk usage based eviction task not configured");
return Ok(());
return None;
};
info!("launching disk usage based eviction task");
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::DiskUsageEviction,
None,
None,
let cancel = CancellationToken::new();
let task = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"disk usage based eviction",
false,
async move {
let cancel = task_mgr::shutdown_token();
{
let cancel = cancel.clone();
async move {
// wait until initial load is complete, because we cannot evict from loading tenants.
tokio::select! {
_ = cancel.cancelled() => { return anyhow::Ok(()); },
_ = background_jobs_barrier.wait() => { }
};
// wait until initial load is complete, because we cannot evict from loading tenants.
tokio::select! {
_ = cancel.cancelled() => { return Ok(()); },
_ = background_jobs_barrier.wait() => { }
};
disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
Ok(())
disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel)
.await;
anyhow::Ok(())
}
},
);
));
Ok(())
Some(DiskUsageEvictionTask(CancellableTask { cancel, task }))
}
#[instrument(skip_all)]

View File

@@ -377,7 +377,7 @@ paths:
schema:
$ref: "#/components/schemas/ConflictError"
/v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
/v1/tenant/{tenant_id}/timeline/{timeline_id}/preserve_initdb_archive:
parameters:
- name: tenant_id
in: path
@@ -397,6 +397,51 @@ paths:
"202":
description: Tenant scheduled to load successfully
/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/archival_config:
parameters:
- name: tenant_shard_id
in: path
required: true
schema:
type: string
- name: timeline_id
in: path
required: true
schema:
type: string
put:
description: |
Either archives or unarchives the given timeline.
An archived timeline may not have any non-archived children.
requestBody:
required: true
content:
application/json:
schema:
$ref: "#/components/schemas/ArchivalConfigRequest"
responses:
"200":
description: Timeline (un)archived successfully
"409":
description: |
The tenant/timeline is already being modified, perhaps by a concurrent call to this API
content:
application/json:
schema:
$ref: "#/components/schemas/ConflictError"
"500":
description: Generic operation error
content:
application/json:
schema:
$ref: "#/components/schemas/Error"
"503":
description: Temporarily unavailable, please retry.
content:
application/json:
schema:
$ref: "#/components/schemas/ServiceUnavailableError"
/v1/tenant/{tenant_id}/synthetic_size:
parameters:
- name: tenant_id
@@ -429,7 +474,9 @@ paths:
schema:
$ref: "#/components/schemas/SyntheticSizeResponse"
text/html:
description: SVG representation of the tenant and it's timelines.
schema:
type: string
description: SVG representation of the tenant and its timelines.
"401":
description: Unauthorized Error
content:
@@ -568,7 +615,7 @@ paths:
type: string
- name: timeline_id
in: path
ŕequired: true
required: true
schema:
type: string
@@ -774,15 +821,13 @@ components:
TenantCreateRequest:
allOf:
- $ref: '#/components/schemas/TenantConfig'
- $ref: '#/components/schemas/TenantLoadRequest'
- type: object
required:
- new_tenant_id
properties:
new_tenant_id:
type: string
generation:
type: integer
description: Attachment generation number.
TenantLoadRequest:
type: object
properties:
@@ -846,6 +891,15 @@ components:
warm:
type: boolean
description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
ArchivalConfigRequest:
type: object
required
- state
properties:
state:
description: The archival state of a timeline
type: string
enum: ["Archived", "Unarchived"]
TenantConfig:
type: object
properties:
@@ -873,8 +927,6 @@ components:
type: string
max_lsn_wal_lag:
type: integer
trace_read_requests:
type: boolean
heatmap_period:
type: string
TenantConfigResponse:
@@ -1108,7 +1160,7 @@ components:
reparented_timelines:
type: array
description: Set of reparented timeline ids
properties:
items:
type: string
format: hex
description: TimelineId

View File

@@ -18,14 +18,17 @@ use hyper::StatusCode;
use hyper::{Body, Request, Response, Uri};
use metrics::launch_timestamp::LaunchTimestamp;
use pageserver_api::models::AuxFilePolicy;
use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
use pageserver_api::models::IngestAuxFilesRequest;
use pageserver_api::models::ListAuxFilesRequest;
use pageserver_api::models::LocationConfig;
use pageserver_api::models::LocationConfigListResponse;
use pageserver_api::models::LocationConfigMode;
use pageserver_api::models::LsnLease;
use pageserver_api::models::LsnLeaseRequest;
use pageserver_api::models::ShardParameters;
use pageserver_api::models::TenantDetails;
use pageserver_api::models::TenantLocationConfigRequest;
use pageserver_api::models::TenantLocationConfigResponse;
use pageserver_api::models::TenantScanRemoteStorageResponse;
use pageserver_api::models::TenantScanRemoteStorageShard;
@@ -33,12 +36,10 @@ use pageserver_api::models::TenantShardLocation;
use pageserver_api::models::TenantShardSplitRequest;
use pageserver_api::models::TenantShardSplitResponse;
use pageserver_api::models::TenantSorting;
use pageserver_api::models::TimelineArchivalConfigRequest;
use pageserver_api::models::TopTenantShardItem;
use pageserver_api::models::TopTenantShardsRequest;
use pageserver_api::models::TopTenantShardsResponse;
use pageserver_api::models::{
DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantLocationConfigRequest,
};
use pageserver_api::shard::ShardCount;
use pageserver_api::shard::TenantShardId;
use remote_storage::DownloadError;
@@ -664,6 +665,39 @@ async fn timeline_preserve_initdb_handler(
json_response(StatusCode::OK, ())
}
async fn timeline_archival_config_handler(
mut request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let state = get_state(&request);
async {
let tenant = state
.tenant_manager
.get_attached_tenant_shard(tenant_shard_id)?;
tenant
.apply_timeline_archival_config(timeline_id, request_data.state)
.await
.context("applying archival config")
.map_err(ApiError::InternalServerError)?;
Ok::<_, ApiError>(())
}
.instrument(info_span!("timeline_archival_config",
tenant_id = %tenant_shard_id.tenant_id,
shard_id = %tenant_shard_id.shard_slug(),
state = ?request_data.state,
%timeline_id))
.await?;
json_response(StatusCode::OK, ())
}
async fn timeline_detail_handler(
request: Request<Body>,
_cancel: CancellationToken,
@@ -1642,6 +1676,10 @@ async fn timeline_checkpoint_handler(
if Some(true) == parse_query_param::<_, bool>(&request, "force_image_layer_creation")? {
flags |= CompactFlags::ForceImageLayerCreation;
}
// By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload.
let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true);
let wait_until_uploaded =
parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false);
@@ -1658,15 +1696,17 @@ async fn timeline_checkpoint_handler(
}
})?;
timeline
.compact(&cancel, flags, &ctx)
.await
.map_err(|e|
match e {
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::Other(e) => ApiError::InternalServerError(e)
}
)?;
if compact {
timeline
.compact(&cancel, flags, &ctx)
.await
.map_err(|e|
match e {
CompactionError::ShuttingDown => ApiError::ShuttingDown,
CompactionError::Other(e) => ApiError::InternalServerError(e)
}
)?;
}
if wait_until_uploaded {
timeline.remote_client.wait_completion().await.map_err(ApiError::InternalServerError)?;
@@ -1721,7 +1761,9 @@ async fn timeline_detach_ancestor_handler(
request: Request<Body>,
_cancel: CancellationToken,
) -> Result<Response<Body>, ApiError> {
use crate::tenant::timeline::detach_ancestor::Options;
use crate::tenant::timeline::detach_ancestor;
use pageserver_api::models::detach_ancestor::AncestorDetached;
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1729,7 +1771,7 @@ async fn timeline_detach_ancestor_handler(
let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
async move {
let mut options = Options::default();
let mut options = detach_ancestor::Options::default();
let rewrite_concurrency =
parse_query_param::<_, std::num::NonZeroUsize>(&request, "rewrite_concurrency")?;
@@ -1757,27 +1799,36 @@ async fn timeline_detach_ancestor_handler(
let timeline = tenant.get_timeline(timeline_id, true)?;
let (_guard, prepared) = timeline
let progress = timeline
.prepare_to_detach_from_ancestor(&tenant, options, ctx)
.await?;
let res = state
.tenant_manager
.complete_detaching_timeline_ancestor(tenant_shard_id, timeline_id, prepared, ctx)
.await;
// uncomment to allow early as possible Tenant::drop
// drop(tenant);
match res {
Ok(reparented_timelines) => {
let resp = pageserver_api::models::detach_ancestor::AncestorDetached {
let resp = match progress {
detach_ancestor::Progress::Prepared(_guard, prepared) => {
// it would be great to tag the guard on to the tenant activation future
let reparented_timelines = state
.tenant_manager
.complete_detaching_timeline_ancestor(
tenant_shard_id,
timeline_id,
prepared,
ctx,
)
.await
.context("timeline detach ancestor completion")
.map_err(ApiError::InternalServerError)?;
AncestorDetached {
reparented_timelines,
};
json_response(StatusCode::OK, resp)
}
}
Err(e) => Err(ApiError::InternalServerError(
e.context("timeline detach completion"),
)),
}
detach_ancestor::Progress::Done(resp) => resp,
};
json_response(StatusCode::OK, resp)
}
.instrument(span)
.await
@@ -2778,6 +2829,10 @@ pub fn make_router(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/preserve_initdb_archive",
|r| api_handler(r, timeline_preserve_initdb_handler),
)
.post(
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/archival_config",
|r| api_handler(r, timeline_archival_config_handler),
)
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
api_handler(r, timeline_detail_handler)
})

View File

@@ -13,6 +13,7 @@ pub mod http;
pub mod import_datadir;
pub mod l0_flush;
pub use pageserver_api::keyspace;
use tokio_util::sync::CancellationToken;
pub mod aux_file;
pub mod metrics;
pub mod page_cache;
@@ -23,7 +24,6 @@ pub mod span;
pub(crate) mod statvfs;
pub mod task_mgr;
pub mod tenant;
pub mod trace;
pub mod utilization;
pub mod virtual_file;
pub mod walingest;
@@ -33,7 +33,10 @@ pub mod walredo;
use crate::task_mgr::TaskKind;
use camino::Utf8Path;
use deletion_queue::DeletionQueue;
use tenant::mgr::TenantManager;
use tenant::{
mgr::{BackgroundPurges, TenantManager},
secondary,
};
use tracing::info;
/// Current storage format version
@@ -55,17 +58,39 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
pub use crate::metrics::preinitialize_metrics;
pub struct CancellableTask {
pub task: tokio::task::JoinHandle<()>,
pub cancel: CancellationToken,
}
pub struct HttpEndpointListener(pub CancellableTask);
pub struct LibpqEndpointListener(pub CancellableTask);
pub struct ConsumptionMetricsTasks(pub CancellableTask);
pub struct DiskUsageEvictionTask(pub CancellableTask);
impl CancellableTask {
pub async fn shutdown(self) {
self.cancel.cancel();
self.task.await.unwrap();
}
}
#[tracing::instrument(skip_all, fields(%exit_code))]
#[allow(clippy::too_many_arguments)]
pub async fn shutdown_pageserver(
http_listener: HttpEndpointListener,
libpq_listener: LibpqEndpointListener,
consumption_metrics_worker: ConsumptionMetricsTasks,
disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
tenant_manager: &TenantManager,
background_purges: BackgroundPurges,
mut deletion_queue: DeletionQueue,
secondary_controller_tasks: secondary::GlobalTasks,
exit_code: i32,
) {
use std::time::Duration;
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
timed(
task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None),
libpq_listener.0.shutdown(),
"shutdown LibpqEndpointListener",
Duration::from_secs(1),
)
@@ -92,16 +117,44 @@ pub async fn shutdown_pageserver(
// Best effort to persist any outstanding deletions, to avoid leaking objects
deletion_queue.shutdown(Duration::from_secs(5)).await;
timed(
consumption_metrics_worker.0.shutdown(),
"shutdown consumption metrics",
Duration::from_secs(1),
)
.await;
timed(
futures::future::OptionFuture::from(disk_usage_eviction_task.map(|t| t.0.shutdown())),
"shutdown disk usage eviction",
Duration::from_secs(1),
)
.await;
timed(
background_purges.shutdown(),
"shutdown background purges",
Duration::from_secs(1),
)
.await;
// Shut down the HTTP endpoint last, so that you can still check the server's
// status while it's shutting down.
// FIXME: We should probably stop accepting commands like attach/detach earlier.
timed(
task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None),
http_listener.0.shutdown(),
"shutdown http",
Duration::from_secs(1),
)
.await;
timed(
secondary_controller_tasks.wait(), // cancellation happened in caller
"secondary controller wait",
Duration::from_secs(1),
)
.await;
// There should be nothing left, but let's be sure
timed(
task_mgr::shutdown_tasks(None, None, None),

View File

@@ -473,6 +473,31 @@ static PITR_HISTORY_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
#[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)]
#[strum(serialize_all = "kebab_case")]
pub(crate) enum MetricLayerKind {
Delta,
Image,
}
static TIMELINE_LAYER_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_layer_bytes",
"Sum of layer physical sizes in bytes",
&["tenant_id", "shard_id", "timeline_id", "kind"]
)
.expect("failed to define a metric")
});
static TIMELINE_LAYER_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_layer_count",
"Number of layers that exist",
&["tenant_id", "shard_id", "timeline_id", "kind"]
)
.expect("failed to define a metric")
});
static TIMELINE_ARCHIVE_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
register_uint_gauge_vec!(
"pageserver_archive_size",
@@ -569,6 +594,38 @@ static VALID_LSN_LEASE_COUNT: Lazy<UIntGaugeVec> = Lazy::new(|| {
.expect("failed to define a metric")
});
pub(crate) static CIRCUIT_BREAKERS_BROKEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_circuit_breaker_broken",
"How many times a circuit breaker has broken"
)
.expect("failed to define a metric")
});
pub(crate) static CIRCUIT_BREAKERS_UNBROKEN: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_circuit_breaker_unbroken",
"How many times a circuit breaker has been un-broken (recovered)"
)
.expect("failed to define a metric")
});
pub(crate) static COMPRESSION_IMAGE_INPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_in_bytes_total",
"Size of uncompressed data written into image layers"
)
.expect("failed to define a metric")
});
pub(crate) static COMPRESSION_IMAGE_OUTPUT_BYTES: Lazy<IntCounter> = Lazy::new(|| {
register_int_counter!(
"pageserver_compression_image_out_bytes_total",
"Size of compressed image layer written"
)
.expect("failed to define a metric")
});
pub(crate) mod initial_logical_size {
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
@@ -1474,7 +1531,6 @@ pub(crate) enum ComputeCommandKind {
Basebackup,
Fullbackup,
LeaseLsn,
Show,
}
pub(crate) struct ComputeCommandCounters {
@@ -2126,6 +2182,10 @@ pub(crate) struct TimelineMetrics {
pub last_record_gauge: IntGauge,
pub pitr_history_size: UIntGauge,
pub archival_size: UIntGauge,
pub(crate) layer_size_image: UIntGauge,
pub(crate) layer_count_image: UIntGauge,
pub(crate) layer_size_delta: UIntGauge,
pub(crate) layer_count_delta: UIntGauge,
pub standby_horizon_gauge: IntGauge,
pub resident_physical_size_gauge: UIntGauge,
/// copy of LayeredTimeline.current_logical_size
@@ -2208,6 +2268,42 @@ impl TimelineMetrics {
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
let layer_size_image = TIMELINE_LAYER_SIZE
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Image.into(),
])
.unwrap();
let layer_count_image = TIMELINE_LAYER_COUNT
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Image.into(),
])
.unwrap();
let layer_size_delta = TIMELINE_LAYER_SIZE
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Delta.into(),
])
.unwrap();
let layer_count_delta = TIMELINE_LAYER_COUNT
.get_metric_with_label_values(&[
&tenant_id,
&shard_id,
&timeline_id,
MetricLayerKind::Delta.into(),
])
.unwrap();
let standby_horizon_gauge = STANDBY_HORIZON
.get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
.unwrap();
@@ -2262,6 +2358,10 @@ impl TimelineMetrics {
last_record_gauge,
pitr_history_size,
archival_size,
layer_size_image,
layer_count_image,
layer_size_delta,
layer_count_delta,
standby_horizon_gauge,
resident_physical_size_gauge,
current_logical_size_gauge,
@@ -2323,6 +2423,31 @@ impl TimelineMetrics {
let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Image.into(),
]);
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Image.into(),
]);
let _ = TIMELINE_LAYER_SIZE.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Delta.into(),
]);
let _ = TIMELINE_LAYER_COUNT.remove_label_values(&[
tenant_id,
shard_id,
timeline_id,
MetricLayerKind::Delta.into(),
]);
let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]);

View File

@@ -36,7 +36,6 @@ use tokio::io::AsyncWriteExt;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::ConnectionId;
use utils::sync::gate::GateGuard;
use utils::{
auth::{Claims, Scope, SwappableJwtAuth},
@@ -66,7 +65,6 @@ use crate::tenant::GetTimelineError;
use crate::tenant::PageReconstructError;
use crate::tenant::Tenant;
use crate::tenant::Timeline;
use crate::trace::Tracer;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::SlruKind;
use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
@@ -126,7 +124,6 @@ pub async fn libpq_listener_main(
None,
None,
"serving compute connection task",
false,
page_service_conn_main(
tenant_manager.clone(),
local_auth,
@@ -430,18 +427,6 @@ impl PageServerHandler {
.get_active_tenant_with_timeout(tenant_id, ShardSelector::First, ACTIVE_TENANT_TIMEOUT)
.await?;
// Make request tracer if needed
let mut tracer = if tenant.get_trace_read_requests() {
let connection_id = ConnectionId::generate();
let path =
tenant
.conf
.trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
Some(Tracer::new(path))
} else {
None
};
// switch client to COPYBOTH
pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
self.flush_cancellable(pgb, &tenant.cancel).await?;
@@ -473,11 +458,6 @@ impl PageServerHandler {
trace!("query: {copy_data_bytes:?}");
fail::fail_point!("ps::handle-pagerequest-message");
// Trace request if needed
if let Some(t) = tracer.as_mut() {
t.trace(&copy_data_bytes)
}
let neon_fe_msg =
PagestreamFeMessage::parse(&mut copy_data_bytes.reader(), protocol_version)?;
@@ -1498,66 +1478,6 @@ where
))?
}
};
} else if let Some(params) = parts.strip_prefix(&["show"]) {
// show <tenant_id>
if params.len() != 1 {
return Err(QueryError::Other(anyhow::anyhow!(
"invalid param number for config command"
)));
}
let tenant_id = TenantId::from_str(params[0])
.with_context(|| format!("Failed to parse tenant id from {}", params[0]))?;
tracing::Span::current().record("tenant_id", field::display(tenant_id));
self.check_permission(Some(tenant_id))?;
COMPUTE_COMMANDS_COUNTERS
.for_command(ComputeCommandKind::Show)
.inc();
let tenant = self
.get_active_tenant_with_timeout(
tenant_id,
ShardSelector::Zero,
ACTIVE_TENANT_TIMEOUT,
)
.await?;
pgb.write_message_noflush(&BeMessage::RowDescription(&[
RowDescriptor::int8_col(b"checkpoint_distance"),
RowDescriptor::int8_col(b"checkpoint_timeout"),
RowDescriptor::int8_col(b"compaction_target_size"),
RowDescriptor::int8_col(b"compaction_period"),
RowDescriptor::int8_col(b"compaction_threshold"),
RowDescriptor::int8_col(b"gc_horizon"),
RowDescriptor::int8_col(b"gc_period"),
RowDescriptor::int8_col(b"image_creation_threshold"),
RowDescriptor::int8_col(b"pitr_interval"),
]))?
.write_message_noflush(&BeMessage::DataRow(&[
Some(tenant.get_checkpoint_distance().to_string().as_bytes()),
Some(
tenant
.get_checkpoint_timeout()
.as_secs()
.to_string()
.as_bytes(),
),
Some(tenant.get_compaction_target_size().to_string().as_bytes()),
Some(
tenant
.get_compaction_period()
.as_secs()
.to_string()
.as_bytes(),
),
Some(tenant.get_compaction_threshold().to_string().as_bytes()),
Some(tenant.get_gc_horizon().to_string().as_bytes()),
Some(tenant.get_gc_period().as_secs().to_string().as_bytes()),
Some(tenant.get_image_creation_threshold().to_string().as_bytes()),
Some(tenant.get_pitr_interval().as_secs().to_string().as_bytes()),
]))?
.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?;
} else {
return Err(QueryError::Other(anyhow::anyhow!(
"unknown command {query_string}"

View File

@@ -284,6 +284,16 @@ impl Timeline {
if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
return Ok(true);
}
// then check if the database was already initialized.
// get_rel_exists can be called before dbdir is created.
let buf = version.get(self, DBDIR_KEY, ctx).await?;
let dbdirs = match DbDirectory::des(&buf).context("deserialization failure") {
Ok(dir) => Ok(dir.dbdirs),
Err(e) => Err(PageReconstructError::from(e)),
}?;
if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
return Ok(false);
}
// fetch directory listing
let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
let buf = version.get(self, key, ctx).await?;
@@ -522,7 +532,7 @@ impl Timeline {
ctx: &RequestContext,
) -> Result<Option<TimestampTz>, PageReconstructError> {
let mut max: Option<TimestampTz> = None;
self.map_all_timestamps(probe_lsn, ctx, |timestamp| {
self.map_all_timestamps::<()>(probe_lsn, ctx, |timestamp| {
if let Some(max_prev) = max {
max = Some(max_prev.max(timestamp));
} else {
@@ -2031,7 +2041,7 @@ mod tests {
#[tokio::test]
async fn aux_files_round_trip() -> anyhow::Result<()> {
let name = "aux_files_round_trip";
let harness = TenantHarness::create(name)?;
let harness = TenantHarness::create(name).await?;
pub const TIMELINE_ID: TimelineId =
TimelineId::from_array(hex!("11223344556677881122334455667788"));

View File

@@ -408,7 +408,6 @@ pub fn spawn<F>(
tenant_shard_id: Option<TenantShardId>,
timeline_id: Option<TimelineId>,
name: &str,
shutdown_process_on_error: bool,
future: F,
) -> PageserverTaskId
where
@@ -437,7 +436,6 @@ where
task_id,
task_cloned,
cancel,
shutdown_process_on_error,
future,
));
task_mut.join_handle = Some(join_handle);
@@ -454,82 +452,78 @@ async fn task_wrapper<F>(
task_id: u64,
task: Arc<PageServerTask>,
shutdown_token: CancellationToken,
shutdown_process_on_error: bool,
future: F,
) where
F: Future<Output = anyhow::Result<()>> + Send + 'static,
{
debug!("Starting task '{}'", task_name);
let result = SHUTDOWN_TOKEN
.scope(
shutdown_token,
CURRENT_TASK.scope(task, {
// We use AssertUnwindSafe here so that the payload function
// doesn't need to be UnwindSafe. We don't do anything after the
// unwinding that would expose us to unwind-unsafe behavior.
AssertUnwindSafe(future).catch_unwind()
}),
)
.await;
task_finish(result, task_name, task_id, shutdown_process_on_error).await;
}
async fn task_finish(
result: std::result::Result<
anyhow::Result<()>,
std::boxed::Box<dyn std::any::Any + std::marker::Send>,
>,
task_name: String,
task_id: u64,
shutdown_process_on_error: bool,
) {
// Remove our entry from the global hashmap.
let task = TASKS
.lock()
.unwrap()
.remove(&task_id)
.expect("no task in registry");
let mut shutdown_process = false;
{
// wrap the future so we log panics and errors
let tenant_shard_id = task.tenant_shard_id;
let timeline_id = task.timeline_id;
let fut = async move {
// We use AssertUnwindSafe here so that the payload function
// doesn't need to be UnwindSafe. We don't do anything after the
// unwinding that would expose us to unwind-unsafe behavior.
let result = AssertUnwindSafe(future).catch_unwind().await;
match result {
Ok(Ok(())) => {
debug!("Task '{}' exited normally", task_name);
}
Ok(Err(err)) => {
if shutdown_process_on_error {
error!(
"Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
);
shutdown_process = true;
} else {
error!(
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
);
}
error!(
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
task_name, tenant_shard_id, timeline_id, err
);
}
Err(err) => {
if shutdown_process_on_error {
error!(
"Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
);
shutdown_process = true;
} else {
error!(
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, task.tenant_shard_id, task.timeline_id, err
);
}
error!(
"Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
task_name, tenant_shard_id, timeline_id, err
);
}
}
}
};
if shutdown_process {
std::process::exit(1);
// add the task-locals
let fut = CURRENT_TASK.scope(task, fut);
let fut = SHUTDOWN_TOKEN.scope(shutdown_token, fut);
// poll future to completion
fut.await;
// Remove our entry from the global hashmap.
TASKS
.lock()
.unwrap()
.remove(&task_id)
.expect("no task in registry");
}
pub async fn exit_on_panic_or_error<T, E>(
task_name: &'static str,
future: impl Future<Output = Result<T, E>>,
) -> T
where
E: std::fmt::Debug,
{
// We use AssertUnwindSafe here so that the payload function
// doesn't need to be UnwindSafe. We don't do anything after the
// unwinding that would expose us to unwind-unsafe behavior.
let result = AssertUnwindSafe(future).catch_unwind().await;
match result {
Ok(Ok(val)) => val,
Ok(Err(err)) => {
error!(
task_name,
"Task exited with error, exiting process: {err:?}"
);
std::process::exit(1);
}
Err(panic_obj) => {
error!(task_name, "Task panicked, exiting process: {panic_obj:?}");
std::process::exit(1);
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -137,14 +137,14 @@ impl<'a> BlockCursor<'a> {
}
/// Reserved bits for length and compression
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
pub(super) const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
/// The maximum size of blobs we support. The highest few bits
/// are reserved for compression and other further uses.
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
const BYTE_UNCOMPRESSED: u8 = 0x80;
const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
/// A wrapper of `VirtualFile` that allows users to write blobs.
///
@@ -390,51 +390,63 @@ impl BlobWriter<false> {
}
#[cfg(test)]
mod tests {
pub(crate) mod tests {
use super::*;
use crate::{context::DownloadBehavior, task_mgr::TaskKind, tenant::block_io::BlockReaderRef};
use camino::Utf8PathBuf;
use camino_tempfile::Utf8TempDir;
use rand::{Rng, SeedableRng};
async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
round_trip_test_compressed::<BUFFERED>(blobs, false).await
}
async fn round_trip_test_compressed<const BUFFERED: bool>(
pub(crate) async fn write_maybe_compressed<const BUFFERED: bool>(
blobs: &[Vec<u8>],
compression: bool,
) -> Result<(), Error> {
ctx: &RequestContext,
) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
let temp_dir = camino_tempfile::tempdir()?;
let pathbuf = temp_dir.path().join("file");
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
// Write part (in block to drop the file)
let mut offsets = Vec::new();
{
let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?;
let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
for blob in blobs.iter() {
let (_, res) = if compression {
wtr.write_blob_maybe_compressed(
blob.clone(),
&ctx,
ctx,
ImageCompressionAlgorithm::Zstd { level: Some(1) },
)
.await
} else {
wtr.write_blob(blob.clone(), &ctx).await
wtr.write_blob(blob.clone(), ctx).await
};
let offs = res?;
offsets.push(offs);
}
// Write out one page worth of zeros so that we can
// read again with read_blk
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], &ctx).await;
let (_, res) = wtr.write_blob(vec![0; PAGE_SZ], ctx).await;
let offs = res?;
println!("Writing final blob at offs={offs}");
wtr.flush_buffer(&ctx).await?;
wtr.flush_buffer(ctx).await?;
}
Ok((temp_dir, pathbuf, offsets))
}
let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?;
async fn round_trip_test_compressed<const BUFFERED: bool>(
blobs: &[Vec<u8>],
compression: bool,
) -> Result<(), Error> {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);
let (_temp_dir, pathbuf, offsets) =
write_maybe_compressed::<BUFFERED>(blobs, compression, &ctx).await?;
let file = VirtualFile::open(pathbuf, &ctx).await?;
let rdr = BlockReaderRef::VirtualFile(&file);
let rdr = BlockCursor::new_with_compression(rdr, compression);
for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
@@ -447,7 +459,7 @@ mod tests {
Ok(())
}
fn random_array(len: usize) -> Vec<u8> {
pub(crate) fn random_array(len: usize) -> Vec<u8> {
let mut rng = rand::thread_rng();
(0..len).map(|_| rng.gen()).collect::<_>()
}

View File

@@ -335,7 +335,6 @@ pub struct TenantConf {
/// A lagging safekeeper will be changed after `lagging_wal_timeout` time elapses since the last WAL update,
/// to avoid eager reconnects.
pub max_lsn_wal_lag: NonZeroU64,
pub trace_read_requests: bool,
pub eviction_policy: EvictionPolicy,
pub min_resident_size_override: Option<u64>,
// See the corresponding metric's help string.
@@ -436,10 +435,6 @@ pub struct TenantConfOpt {
#[serde(default)]
pub max_lsn_wal_lag: Option<NonZeroU64>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub trace_read_requests: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
#[serde(default)]
pub eviction_policy: Option<EvictionPolicy>,
@@ -519,9 +514,6 @@ impl TenantConfOpt {
.lagging_wal_timeout
.unwrap_or(global_conf.lagging_wal_timeout),
max_lsn_wal_lag: self.max_lsn_wal_lag.unwrap_or(global_conf.max_lsn_wal_lag),
trace_read_requests: self
.trace_read_requests
.unwrap_or(global_conf.trace_read_requests),
eviction_policy: self.eviction_policy.unwrap_or(global_conf.eviction_policy),
min_resident_size_override: self
.min_resident_size_override
@@ -581,7 +573,6 @@ impl Default for TenantConf {
.expect("cannot parse default walreceiver lagging wal timeout"),
max_lsn_wal_lag: NonZeroU64::new(DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG)
.expect("cannot parse default max walreceiver Lsn wal lag"),
trace_read_requests: false,
eviction_policy: EvictionPolicy::NoEviction,
min_resident_size_override: None,
evictions_low_residence_duration_metric_threshold: humantime::parse_duration(
@@ -659,7 +650,6 @@ impl From<TenantConfOpt> for models::TenantConfig {
walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime),
lagging_wal_timeout: value.lagging_wal_timeout.map(humantime),
max_lsn_wal_lag: value.max_lsn_wal_lag,
trace_read_requests: value.trace_read_requests,
eviction_policy: value.eviction_policy,
min_resident_size_override: value.min_resident_size_override,
evictions_low_residence_duration_metric_threshold: value

View File

@@ -262,7 +262,7 @@ where
pub fn iter<'a>(self, start_key: &'a [u8; L], ctx: &'a RequestContext) -> DiskBtreeIterator<'a>
where
R: 'a,
R: 'a + Send,
{
DiskBtreeIterator {
stream: Box::pin(self.into_stream(start_key, ctx)),
@@ -521,7 +521,7 @@ where
pub struct DiskBtreeIterator<'a> {
#[allow(clippy::type_complexity)]
stream: std::pin::Pin<
Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a>,
Box<dyn Stream<Item = std::result::Result<(Vec<u8>, u64), DiskBtreeError>> + 'a + Send>,
>,
}
@@ -550,10 +550,10 @@ where
/// We maintain the length of the stack to be always greater than zero.
/// Two exceptions are:
/// 1. `Self::flush_node`. The method will push the new node if it extracted the last one.
/// So because other methods cannot see the intermediate state invariant still holds.
/// So because other methods cannot see the intermediate state invariant still holds.
/// 2. `Self::finish`. It consumes self and does not return it back,
/// which means that this is where the structure is destroyed.
/// Thus stack of zero length cannot be observed by other methods.
/// which means that this is where the structure is destroyed.
/// Thus stack of zero length cannot be observed by other methods.
stack: Vec<BuildNode<L>>,
/// Last key that was appended to the tree. Used to sanity check that append

View File

@@ -463,7 +463,7 @@ impl LayerMap {
pub(self) fn insert_historic_noflush(&mut self, layer_desc: PersistentLayerDesc) {
// TODO: See #3869, resulting #4088, attempted fix and repro #4094
if Self::is_l0(&layer_desc) {
if Self::is_l0(&layer_desc.key_range) {
self.l0_delta_layers.push(layer_desc.clone().into());
}
@@ -482,7 +482,7 @@ impl LayerMap {
self.historic
.remove(historic_layer_coverage::LayerKey::from(layer_desc));
let layer_key = layer_desc.key();
if Self::is_l0(layer_desc) {
if Self::is_l0(&layer_desc.key_range) {
let len_before = self.l0_delta_layers.len();
let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
l0_delta_layers.retain(|other| other.key() != layer_key);
@@ -598,8 +598,9 @@ impl LayerMap {
coverage
}
pub fn is_l0(layer: &PersistentLayerDesc) -> bool {
layer.get_key_range() == (Key::MIN..Key::MAX)
/// Check if the key range resembles that of an L0 layer.
pub fn is_l0(key_range: &Range<Key>) -> bool {
key_range == &(Key::MIN..Key::MAX)
}
/// This function determines which layers are counted in `count_deltas`:
@@ -626,7 +627,7 @@ impl LayerMap {
/// than just the current partition_range.
pub fn is_reimage_worthy(layer: &PersistentLayerDesc, partition_range: &Range<Key>) -> bool {
// Case 1
if !Self::is_l0(layer) {
if !Self::is_l0(&layer.key_range) {
return true;
}

View File

@@ -36,7 +36,7 @@ use crate::control_plane_client::{
use crate::deletion_queue::DeletionQueueClient;
use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
use crate::task_mgr::{self, TaskKind};
use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::config::{
AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
};
@@ -225,26 +225,98 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
Ok(tmp_path)
}
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
/// the background, and thereby avoid blocking any API requests on this deletion completing.
fn spawn_background_purge(tmp_path: Utf8PathBuf) {
// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
// After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
let task_tenant_id = None;
/// See [`Self::spawn`].
#[derive(Clone)]
pub struct BackgroundPurges(Arc<std::sync::Mutex<BackgroundPurgesInner>>);
enum BackgroundPurgesInner {
Open(tokio::task::JoinSet<()>),
// we use the async mutex for coalescing
ShuttingDown(Arc<tokio::sync::Mutex<tokio::task::JoinSet<()>>>),
}
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
task_tenant_id,
None,
"tenant_files_delete",
false,
async move {
fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))
},
);
impl Default for BackgroundPurges {
fn default() -> Self {
Self(Arc::new(std::sync::Mutex::new(
BackgroundPurgesInner::Open(JoinSet::new()),
)))
}
}
impl BackgroundPurges {
/// When we have moved a tenant's content to a temporary directory, we may delete it lazily in
/// the background, and thereby avoid blocking any API requests on this deletion completing.
///
/// Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
/// Thus the [`BackgroundPurges`] type to keep track of these tasks.
pub fn spawn(&self, tmp_path: Utf8PathBuf) {
let mut guard = self.0.lock().unwrap();
let jset = match &mut *guard {
BackgroundPurgesInner::Open(ref mut jset) => jset,
BackgroundPurgesInner::ShuttingDown(_) => {
warn!("trying to spawn background purge during shutdown, ignoring");
return;
}
};
jset.spawn_on(
async move {
if let Err(error) = fs::remove_dir_all(tmp_path.as_path()).await {
// should we fatal_io_error here?
warn!(%error, path=%tmp_path, "failed to purge tenant directory");
}
}
.instrument(info_span!(parent: None, "background_purge")),
BACKGROUND_RUNTIME.handle(),
);
}
/// When this future completes, all background purges have completed.
/// The first poll of the future will already lock out new background purges spawned via [`Self::spawn`].
///
/// Concurrent calls will coalesce.
///
/// # Cancellation-Safety
///
/// If this future is dropped before polled to completion, concurrent and subsequent
/// instances of this future will continue to be correct.
#[instrument(skip_all)]
pub async fn shutdown(&self) {
let jset = {
let mut guard = self.0.lock().unwrap();
match &mut *guard {
BackgroundPurgesInner::Open(jset) => {
*guard = BackgroundPurgesInner::ShuttingDown(Arc::new(tokio::sync::Mutex::new(
std::mem::take(jset),
)))
}
BackgroundPurgesInner::ShuttingDown(_) => {
// calling shutdown multiple times is most likely a bug in pageserver shutdown code
warn!("already shutting down");
}
};
match &mut *guard {
BackgroundPurgesInner::ShuttingDown(ref mut jset) => jset.clone(),
BackgroundPurgesInner::Open(_) => {
unreachable!("above code transitions into shut down state");
}
}
};
let mut jset = jset.lock().await; // concurrent callers coalesce here
while let Some(res) = jset.join_next().await {
match res {
Ok(()) => {}
Err(e) if e.is_panic() => {
// If it panicked, the error is already logged by the panic hook.
}
Err(e) if e.is_cancelled() => {
unreachable!("we don't cancel the joinset or runtime")
}
Err(e) => {
// No idea when this can happen, but let's log it.
warn!(%e, "background purge task failed or panicked");
}
}
}
}
}
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
@@ -270,6 +342,8 @@ pub struct TenantManager {
// tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
// when the tenant detaches.
cancel: CancellationToken,
background_purges: BackgroundPurges,
}
fn emergency_generations(
@@ -447,6 +521,7 @@ pub(crate) enum DeleteTenantError {
#[instrument(skip_all)]
pub async fn init_tenant_mgr(
conf: &'static PageServerConf,
background_purges: BackgroundPurges,
resources: TenantSharedResources,
init_order: InitializationOrder,
cancel: CancellationToken,
@@ -512,7 +587,7 @@ pub async fn init_tenant_mgr(
match safe_rename_tenant_dir(&tenant_dir_path).await {
Ok(tmp_path) => {
spawn_background_purge(tmp_path);
background_purges.spawn(tmp_path);
}
Err(e) => {
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
@@ -634,6 +709,7 @@ pub async fn init_tenant_mgr(
tenants: &TENANTS,
resources,
cancel: CancellationToken::new(),
background_purges,
})
}
@@ -1353,6 +1429,7 @@ impl TenantManager {
async fn delete_local(
conf: &PageServerConf,
background_purges: &BackgroundPurges,
tenant_shard_id: &TenantShardId,
) -> anyhow::Result<()> {
let local_tenant_directory = conf.tenant_path(tenant_shard_id);
@@ -1361,7 +1438,7 @@ impl TenantManager {
.with_context(|| {
format!("local tenant directory {local_tenant_directory:?} rename")
})?;
spawn_background_purge(tmp_dir);
background_purges.spawn(tmp_dir);
Ok(())
}
@@ -1379,12 +1456,12 @@ impl TenantManager {
barrier.wait().await;
}
}
delete_local(self.conf, &tenant_shard_id).await?;
delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?;
}
Some(TenantSlot::Secondary(secondary_tenant)) => {
secondary_tenant.shutdown().await;
delete_local(self.conf, &tenant_shard_id).await?;
delete_local(self.conf, &self.background_purges, &tenant_shard_id).await?;
}
Some(TenantSlot::InProgress(_)) => unreachable!(),
None => {}
@@ -1655,7 +1732,7 @@ impl TenantManager {
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
spawn_background_purge(tmp_path);
self.background_purges.spawn(tmp_path);
fail::fail_point!("shard-split-pre-finish", |_| Err(anyhow::anyhow!(
"failpoint"
@@ -1831,7 +1908,7 @@ impl TenantManager {
let tmp_path = self
.detach_tenant0(conf, tenant_shard_id, deletion_queue_client)
.await?;
spawn_background_purge(tmp_path);
self.background_purges.spawn(tmp_path);
Ok(())
}
@@ -2698,7 +2775,9 @@ mod tests {
// Test that if an InProgress tenant is in the map during shutdown, the shutdown will gracefully
// wait for it to complete before proceeding.
let h = TenantHarness::create("shutdown_awaits_in_progress_tenant").unwrap();
let h = TenantHarness::create("shutdown_awaits_in_progress_tenant")
.await
.unwrap();
let (t, _ctx) = h.load().await;
// harness loads it to active, which is forced and nothing is running on the tenant

View File

@@ -241,7 +241,7 @@ use self::index::IndexPart;
use super::metadata::MetadataUpdate;
use super::storage_layer::{Layer, LayerName, ResidentLayer};
use super::upload_queue::SetDeletedFlagProgress;
use super::upload_queue::{NotInitialized, SetDeletedFlagProgress};
use super::Generation;
pub(crate) use download::{
@@ -1525,7 +1525,6 @@ impl RemoteTimelineClient {
Some(self.tenant_shard_id),
Some(self.timeline_id),
"remote upload",
false,
async move {
self_rc.perform_upload_task(task).await;
Ok(())
@@ -1930,6 +1929,31 @@ impl RemoteTimelineClient {
}
}
}
/// Returns an accessor which will hold the UploadQueue mutex for accessing the upload queue
/// externally to RemoteTimelineClient.
pub(crate) fn initialized_upload_queue(
&self,
) -> Result<UploadQueueAccessor<'_>, NotInitialized> {
let mut inner = self.upload_queue.lock().unwrap();
inner.initialized_mut()?;
Ok(UploadQueueAccessor { inner })
}
}
pub(crate) struct UploadQueueAccessor<'a> {
inner: std::sync::MutexGuard<'a, UploadQueue>,
}
impl<'a> UploadQueueAccessor<'a> {
pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart {
match &*self.inner {
UploadQueue::Initialized(x) => &x.clean.0,
UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
unreachable!("checked before constructing")
}
}
}
}
pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath {
@@ -2103,7 +2127,7 @@ mod tests {
impl TestSetup {
async fn new(test_name: &str) -> anyhow::Result<Self> {
let test_name = Box::leak(Box::new(format!("remote_timeline_client__{test_name}")));
let harness = TenantHarness::create(test_name)?;
let harness = TenantHarness::create(test_name).await?;
let (tenant, ctx) = harness.load().await;
let timeline = tenant

View File

@@ -176,6 +176,24 @@ pub(crate) struct Lineage {
///
/// If you are adding support for detaching from a hierarchy, consider changing the ancestry
/// into a `Vec<(TimelineId, Lsn)>` to be a path instead.
// FIXME: this is insufficient even for path of two timelines for future wal recovery
// purposes:
//
// assuming a "old main" which has received most of the WAL, and has a branch "new main",
// starting a bit before "old main" last_record_lsn. the current version works fine,
// because we will know to replay wal and branch at the recorded Lsn to do wal recovery.
//
// then assuming "new main" would similarly receive a branch right before its last_record_lsn,
// "new new main". the current implementation would just store ("new main", ancestor_lsn, _)
// here. however, we cannot recover from WAL using only that information, we would need the
// whole ancestry here:
//
// ```json
// [
// ["old main", ancestor_lsn("new main"), _],
// ["new main", ancestor_lsn("new new main"), _]
// ]
// ```
#[serde(skip_serializing_if = "Option::is_none", default)]
original_ancestor: Option<(TimelineId, Lsn, NaiveDateTime)>,
}
@@ -217,6 +235,14 @@ impl Lineage {
self.original_ancestor
.is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn)
}
pub(crate) fn is_detached_from_original_ancestor(&self) -> bool {
self.original_ancestor.is_some()
}
pub(crate) fn is_reparented(&self) -> bool {
!self.reparenting_history.is_empty()
}
}
#[cfg(test)]

View File

@@ -31,6 +31,7 @@ use pageserver_api::{
};
use remote_storage::GenericRemoteStorage;
use tokio::task::JoinHandle;
use tokio_util::sync::CancellationToken;
use tracing::instrument;
use utils::{completion::Barrier, id::TimelineId, sync::gate::Gate};
@@ -293,15 +294,50 @@ impl SecondaryController {
}
}
pub struct GlobalTasks {
cancel: CancellationToken,
uploader: JoinHandle<()>,
downloader: JoinHandle<()>,
}
impl GlobalTasks {
/// Caller is responsible for requesting shutdown via the cancellation token that was
/// passed to [`spawn_tasks`].
///
/// # Panics
///
/// This method panics if that token is not cancelled.
/// This is low-risk because we're calling this during process shutdown, so, a panic
/// will be informative but not cause undue downtime.
pub async fn wait(self) {
let Self {
cancel,
uploader,
downloader,
} = self;
assert!(
cancel.is_cancelled(),
"must cancel cancellation token, otherwise the tasks will not shut down"
);
let (uploader, downloader) = futures::future::join(uploader, downloader).await;
uploader.expect(
"unreachable: exit_on_panic_or_error would catch the panic and exit the process",
);
downloader.expect(
"unreachable: exit_on_panic_or_error would catch the panic and exit the process",
);
}
}
pub fn spawn_tasks(
tenant_manager: Arc<TenantManager>,
remote_storage: GenericRemoteStorage,
background_jobs_can_start: Barrier,
cancel: CancellationToken,
) -> SecondaryController {
) -> (SecondaryController, GlobalTasks) {
let mgr_clone = tenant_manager.clone();
let storage_clone = remote_storage.clone();
let cancel_clone = cancel.clone();
let bg_jobs_clone = background_jobs_can_start.clone();
let (download_req_tx, download_req_rx) =
@@ -309,17 +345,9 @@ pub fn spawn_tasks(
let (upload_req_tx, upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
let downloader_task_ctx = RequestContext::new(
TaskKind::SecondaryDownloads,
crate::context::DownloadBehavior::Download,
);
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
downloader_task_ctx.task_kind(),
None,
None,
let cancel_clone = cancel.clone();
let downloader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"secondary tenant downloads",
false,
async move {
downloader_task(
mgr_clone,
@@ -327,49 +355,41 @@ pub fn spawn_tasks(
download_req_rx,
bg_jobs_clone,
cancel_clone,
downloader_task_ctx,
RequestContext::new(
TaskKind::SecondaryDownloads,
crate::context::DownloadBehavior::Download,
),
)
.await;
Ok(())
anyhow::Ok(())
},
);
));
task_mgr::spawn(
BACKGROUND_RUNTIME.handle(),
TaskKind::SecondaryUploads,
None,
None,
let cancel_clone = cancel.clone();
let uploader = BACKGROUND_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
"heatmap uploads",
false,
async move {
heatmap_uploader_task(
tenant_manager,
remote_storage,
upload_req_rx,
background_jobs_can_start,
cancel,
cancel_clone,
)
.await;
Ok(())
anyhow::Ok(())
},
);
));
SecondaryController {
download_req_tx,
upload_req_tx,
}
}
/// For running with remote storage disabled: a SecondaryController that is connected to nothing.
pub fn null_controller() -> SecondaryController {
let (download_req_tx, _download_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
let (upload_req_tx, _upload_req_rx) =
tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
SecondaryController {
upload_req_tx,
download_req_tx,
}
(
SecondaryController {
upload_req_tx,
download_req_tx,
},
GlobalTasks {
cancel,
uploader,
downloader,
},
)
}

View File

@@ -135,11 +135,9 @@ pub struct TimelineInputs {
ancestor_lsn: Lsn,
last_record: Lsn,
latest_gc_cutoff: Lsn,
horizon_cutoff: Lsn,
pitr_cutoff: Lsn,
/// Cutoff point based on GC settings
next_gc_cutoff: Lsn,
next_pitr_cutoff: Lsn,
/// Cutoff point calculated from the user-supplied 'max_retention_period'
retention_param_cutoff: Option<Lsn>,
@@ -150,7 +148,7 @@ pub struct TimelineInputs {
/// Gathers the inputs for the tenant sizing model.
///
/// Tenant size does not consider the latest state, but only the state until next_gc_cutoff, which
/// Tenant size does not consider the latest state, but only the state until next_pitr_cutoff, which
/// is updated on-demand, during the start of this calculation and separate from the
/// [`TimelineInputs::latest_gc_cutoff`].
///
@@ -158,11 +156,8 @@ pub struct TimelineInputs {
///
/// ```text
/// 0-----|---------|----|------------| · · · · · |·> lsn
/// initdb_lsn branchpoints* next_gc_cutoff latest
/// initdb_lsn branchpoints* next_pitr_cutoff latest
/// ```
///
/// Until gc_horizon_cutoff > `Timeline::last_record_lsn` for any of the tenant's timelines, the
/// tenant size will be zero.
pub(super) async fn gather_inputs(
tenant: &Tenant,
limit: &Arc<Semaphore>,
@@ -172,7 +167,7 @@ pub(super) async fn gather_inputs(
cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<ModelInputs, CalculateSyntheticSizeError> {
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
// refresh is needed to update [`timeline::GcCutoffs`]
tenant.refresh_gc_info(cancel, ctx).await?;
// Collect information about all the timelines
@@ -236,20 +231,18 @@ pub(super) async fn gather_inputs(
// we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
// actually removing files.
//
// We only consider [`GcInfo::pitr_cutoff`], and not [`GcInfo::horizon_cutoff`], because from
// We only consider [`timeline::GcCutoffs::time`], and not [`timeline::GcCutoffs::space`], because from
// a user's perspective they have only requested retention up to the time bound (pitr_cutoff), rather
// than a space bound (horizon cutoff). This means that if someone drops a database and waits for their
// than our internal space cutoff. This means that if someone drops a database and waits for their
// PITR interval, they will see synthetic size decrease, even if we are still storing data inside
// horizon_cutoff.
let pitr_cutoff = gc_info.cutoffs.pitr;
let horizon_cutoff = gc_info.cutoffs.horizon;
let mut next_gc_cutoff = pitr_cutoff;
// the space cutoff.
let mut next_pitr_cutoff = gc_info.cutoffs.time;
// If the caller provided a shorter retention period, use that instead of the GC cutoff.
let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period));
if next_gc_cutoff < param_cutoff {
next_gc_cutoff = param_cutoff;
if next_pitr_cutoff < param_cutoff {
next_pitr_cutoff = param_cutoff;
}
Some(param_cutoff)
} else {
@@ -263,7 +256,7 @@ pub(super) async fn gather_inputs(
.copied()
.collect::<Vec<_>>();
// next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
// next_pitr_cutoff in parent branch are not of interest (right now at least), nor do we
// want to query any logical size before initdb_lsn.
let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
@@ -271,10 +264,10 @@ pub(super) async fn gather_inputs(
let mut lsns: Vec<(Lsn, LsnKind)> = gc_info
.retain_lsns
.iter()
.filter(|&&lsn| lsn > ancestor_lsn)
.filter(|(lsn, _child_id)| lsn > &ancestor_lsn)
.copied()
// this assumes there are no other retain_lsns than the branchpoints
.map(|lsn| (lsn, LsnKind::BranchPoint))
.map(|(lsn, _child_id)| (lsn, LsnKind::BranchPoint))
.collect::<Vec<_>>();
lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
@@ -291,10 +284,10 @@ pub(super) async fn gather_inputs(
)
}
// Add a point for the GC cutoff
let branch_start_needed = next_gc_cutoff <= branch_start_lsn;
// Add a point for the PITR cutoff
let branch_start_needed = next_pitr_cutoff <= branch_start_lsn;
if !branch_start_needed {
lsns.push((next_gc_cutoff, LsnKind::GcCutOff));
lsns.push((next_pitr_cutoff, LsnKind::GcCutOff));
}
lsns.sort_unstable();
@@ -333,7 +326,7 @@ pub(super) async fn gather_inputs(
parent: Some(parent),
lsn: lsn.0,
size: None,
needed: lsn > next_gc_cutoff,
needed: lsn > next_pitr_cutoff,
},
timeline_id: timeline.timeline_id,
kind,
@@ -357,8 +350,8 @@ pub(super) async fn gather_inputs(
segment: Segment {
parent: Some(lease_parent),
lsn: lsn.0,
size: None, // Filled in later, if necessary
needed: lsn > next_gc_cutoff, // only needed if the point is within rentention.
size: None, // Filled in later, if necessary
needed: lsn > next_pitr_cutoff, // only needed if the point is within rentention.
},
timeline_id: timeline.timeline_id,
kind: LsnKind::LeaseStart,
@@ -398,9 +391,7 @@ pub(super) async fn gather_inputs(
last_record: last_record_lsn,
// this is not used above, because it might not have updated recently enough
latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
horizon_cutoff,
pitr_cutoff,
next_gc_cutoff,
next_pitr_cutoff,
retention_param_cutoff,
lease_points,
});
@@ -742,9 +733,7 @@ fn verify_size_for_multiple_branches() {
"ancestor_lsn": "0/18D3D98",
"last_record": "0/2230CD0",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/2210CD0",
"pitr_cutoff": "0/2210CD0",
"next_gc_cutoff": "0/2210CD0",
"next_pitr_cutoff": "0/2210CD0",
"retention_param_cutoff": null,
"lease_points": []
},
@@ -753,9 +742,7 @@ fn verify_size_for_multiple_branches() {
"ancestor_lsn": "0/176D998",
"last_record": "0/1837770",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/1817770",
"pitr_cutoff": "0/1817770",
"next_gc_cutoff": "0/1817770",
"next_pitr_cutoff": "0/1817770",
"retention_param_cutoff": null,
"lease_points": []
},
@@ -764,9 +751,7 @@ fn verify_size_for_multiple_branches() {
"ancestor_lsn": "0/0",
"last_record": "0/18D3D98",
"latest_gc_cutoff": "0/1698C48",
"horizon_cutoff": "0/18B3D98",
"pitr_cutoff": "0/18B3D98",
"next_gc_cutoff": "0/18B3D98",
"next_pitr_cutoff": "0/18B3D98",
"retention_param_cutoff": null,
"lease_points": []
}
@@ -820,9 +805,7 @@ fn verify_size_for_one_branch() {
"ancestor_lsn": "0/0",
"last_record": "47/280A5860",
"latest_gc_cutoff": "47/240A5860",
"horizon_cutoff": "47/240A5860",
"pitr_cutoff": "47/240A5860",
"next_gc_cutoff": "47/240A5860",
"next_pitr_cutoff": "47/240A5860",
"retention_param_cutoff": "0/0",
"lease_points": []
}

View File

@@ -6,35 +6,22 @@ pub(crate) mod inmemory_layer;
pub(crate) mod layer;
mod layer_desc;
mod layer_name;
#[cfg(test)]
pub mod merge_iterator;
use crate::context::{AccessStatsBehavior, RequestContext};
use crate::repository::Value;
use crate::task_mgr::TaskKind;
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
use enum_map::EnumMap;
use enumset::EnumSet;
use once_cell::sync::Lazy;
use pageserver_api::key::Key;
use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
use pageserver_api::models::{
LayerAccessKind, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
};
use std::borrow::Cow;
use std::cmp::{Ordering, Reverse};
use std::collections::hash_map::Entry;
use std::collections::{BinaryHeap, HashMap};
use std::ops::Range;
use std::sync::{Arc, Mutex};
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tracing::warn;
use utils::history_buffer::HistoryBufferWithDropCounter;
use utils::rate_limit::RateLimit;
use utils::{id::TimelineId, lsn::Lsn};
use utils::lsn::Lsn;
pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
pub use image_layer::{ImageLayer, ImageLayerWriter};
@@ -77,9 +64,9 @@ where
/// call, to collect more records.
///
#[derive(Debug, Default)]
pub struct ValueReconstructState {
pub records: Vec<(Lsn, NeonWalRecord)>,
pub img: Option<(Lsn, Bytes)>,
pub(crate) struct ValueReconstructState {
pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
pub(crate) img: Option<(Lsn, Bytes)>,
}
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
@@ -460,94 +447,92 @@ pub enum ValueReconstructResult {
Missing,
}
#[derive(Debug)]
pub struct LayerAccessStats(Mutex<LayerAccessStatsLocked>);
/// This struct holds two instances of [`LayerAccessStatsInner`].
/// Accesses are recorded to both instances.
/// The `for_scraping_api`instance can be reset from the management API via [`LayerAccessStatsReset`].
/// The `for_eviction_policy` is never reset.
#[derive(Debug, Default, Clone)]
struct LayerAccessStatsLocked {
for_scraping_api: LayerAccessStatsInner,
for_eviction_policy: LayerAccessStatsInner,
/// Layers contain a hint indicating whether they are likely to be used for reads. This is a hint rather
/// than an authoritative value, so that we do not have to update it synchronously when changing the visibility
/// of layers (for example when creating a branch that makes some previously covered layers visible). It should
/// be used for cache management but not for correctness-critical checks.
#[derive(Default, Debug, Clone, PartialEq, Eq)]
pub(crate) enum LayerVisibilityHint {
/// A Visible layer might be read while serving a read, because there is not an image layer between it
/// and a readable LSN (the tip of the branch or a child's branch point)
Visible,
/// A Covered layer probably won't be read right now, but _can_ be read in future if someone creates
/// a branch or ephemeral endpoint at an LSN below the layer that covers this.
#[allow(unused)]
Covered,
/// Calculating layer visibilty requires I/O, so until this has happened layers are loaded
/// in this state. Note that newly written layers may be called Visible immediately, this uninitialized
/// state is for when existing layers are constructed while loading a timeline.
#[default]
Uninitialized,
}
impl LayerAccessStatsLocked {
fn iter_mut(&mut self) -> impl Iterator<Item = &mut LayerAccessStatsInner> {
[&mut self.for_scraping_api, &mut self.for_eviction_policy].into_iter()
}
}
#[derive(Debug, Default, Clone)]
struct LayerAccessStatsInner {
first_access: Option<LayerAccessStatFullDetails>,
count_by_access_kind: EnumMap<LayerAccessKind, u64>,
task_kind_flag: EnumSet<TaskKind>,
last_accesses: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
last_residence_changes: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
}
#[derive(Debug, Clone, Copy)]
pub(crate) struct LayerAccessStatFullDetails {
pub(crate) when: SystemTime,
pub(crate) task_kind: TaskKind,
pub(crate) access_kind: LayerAccessKind,
}
pub(crate) struct LayerAccessStats(std::sync::atomic::AtomicU64);
#[derive(Clone, Copy, strum_macros::EnumString)]
pub enum LayerAccessStatsReset {
pub(crate) enum LayerAccessStatsReset {
NoReset,
JustTaskKindFlags,
AllStats,
}
fn system_time_to_millis_since_epoch(ts: &SystemTime) -> u64 {
ts.duration_since(UNIX_EPOCH)
.expect("better to die in this unlikely case than report false stats")
.as_millis()
.try_into()
.expect("64 bits is enough for few more years")
}
impl Default for LayerAccessStats {
fn default() -> Self {
// Default value is to assume resident since creation time, and visible.
let (_mask, mut value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, SystemTime::now());
value |= 0x1 << Self::VISIBILITY_SHIFT;
impl LayerAccessStatFullDetails {
fn as_api_model(&self) -> pageserver_api::models::LayerAccessStatFullDetails {
let Self {
when,
task_kind,
access_kind,
} = self;
pageserver_api::models::LayerAccessStatFullDetails {
when_millis_since_epoch: system_time_to_millis_since_epoch(when),
task_kind: Cow::Borrowed(task_kind.into()), // into static str, powered by strum_macros
access_kind: *access_kind,
}
Self(std::sync::atomic::AtomicU64::new(value))
}
}
// Efficient store of two very-low-resolution timestamps and some bits. Used for storing last access time and
// last residence change time.
impl LayerAccessStats {
/// Create an empty stats object.
///
/// The caller is responsible for recording a residence event
/// using [`record_residence_event`] before calling `latest_activity`.
/// If they don't, [`latest_activity`] will return `None`.
///
/// [`record_residence_event`]: Self::record_residence_event
/// [`latest_activity`]: Self::latest_activity
pub(crate) fn empty_will_record_residence_event_later() -> Self {
LayerAccessStats(Mutex::default())
// How many high bits to drop from a u32 timestamp?
// - Only storing up to a u32 timestamp will work fine until 2038 (if this code is still in use
// after that, this software has been very successful!)
// - Dropping the top bit is implicitly safe because unix timestamps are meant to be
// stored in an i32, so they never used it.
// - Dropping the next two bits is safe because this code is only running on systems in
// years >= 2024, and these bits have been 1 since 2021
//
// Therefore we may store only 28 bits for a timestamp with one second resolution. We do
// this truncation to make space for some flags in the high bits of our u64.
const TS_DROP_HIGH_BITS: u32 = u32::count_ones(Self::TS_ONES) + 1;
const TS_MASK: u32 = 0x1f_ff_ff_ff;
const TS_ONES: u32 = 0x60_00_00_00;
const ATIME_SHIFT: u32 = 0;
const RTIME_SHIFT: u32 = 32 - Self::TS_DROP_HIGH_BITS;
const VISIBILITY_SHIFT: u32 = 64 - 2 * Self::TS_DROP_HIGH_BITS;
fn write_bits(&self, mask: u64, value: u64) -> u64 {
self.0
.fetch_update(
// TODO: decide what orderings are correct
std::sync::atomic::Ordering::Relaxed,
std::sync::atomic::Ordering::Relaxed,
|v| Some((v & !mask) | (value & mask)),
)
.expect("Inner function is infallible")
}
/// Create an empty stats object and record a [`LayerLoad`] event with the given residence status.
///
/// See [`record_residence_event`] for why you need to do this while holding the layer map lock.
///
/// [`LayerLoad`]: LayerResidenceEventReason::LayerLoad
/// [`record_residence_event`]: Self::record_residence_event
pub(crate) fn for_loading_layer(status: LayerResidenceStatus) -> Self {
let new = LayerAccessStats(Mutex::new(LayerAccessStatsLocked::default()));
new.record_residence_event(status, LayerResidenceEventReason::LayerLoad);
new
fn to_low_res_timestamp(shift: u32, time: SystemTime) -> (u64, u64) {
// Drop the low three bits of the timestamp, for an ~8s accuracy
let timestamp = time.duration_since(UNIX_EPOCH).unwrap().as_secs() & (Self::TS_MASK as u64);
((Self::TS_MASK as u64) << shift, timestamp << shift)
}
fn read_low_res_timestamp(&self, shift: u32) -> Option<SystemTime> {
let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
let ts_bits = (read & ((Self::TS_MASK as u64) << shift)) >> shift;
if ts_bits == 0 {
None
} else {
Some(UNIX_EPOCH + Duration::from_secs(ts_bits | (Self::TS_ONES as u64)))
}
}
/// Record a change in layer residency.
@@ -563,129 +548,112 @@ impl LayerAccessStats {
/// - Eviction: imitate access logical size calculation. This accesses the L0 layers because the L1 layer is not yet in the layer map.
/// - Compact: Grab layer map lock, add the new L1 to layer map and remove the L0s, release layer map lock.
/// - Eviction: observes the new L1 layer whose only activity timestamp is the LayerCreate event.
///
pub(crate) fn record_residence_event(
&self,
status: LayerResidenceStatus,
reason: LayerResidenceEventReason,
) {
let mut locked = self.0.lock().unwrap();
locked.iter_mut().for_each(|inner| {
inner
.last_residence_changes
.write(LayerResidenceEvent::new(status, reason))
});
pub(crate) fn record_residence_event_at(&self, now: SystemTime) {
let (mask, value) = Self::to_low_res_timestamp(Self::RTIME_SHIFT, now);
self.write_bits(mask, value);
}
fn record_access(&self, access_kind: LayerAccessKind, ctx: &RequestContext) {
pub(crate) fn record_residence_event(&self) {
self.record_residence_event_at(SystemTime::now())
}
pub(crate) fn record_access_at(&self, now: SystemTime) {
let (mut mask, mut value) = Self::to_low_res_timestamp(Self::ATIME_SHIFT, now);
// A layer which is accessed must be visible.
mask |= 0x1 << Self::VISIBILITY_SHIFT;
value |= 0x1 << Self::VISIBILITY_SHIFT;
self.write_bits(mask, value);
}
pub(crate) fn record_access(&self, ctx: &RequestContext) {
if ctx.access_stats_behavior() == AccessStatsBehavior::Skip {
return;
}
let this_access = LayerAccessStatFullDetails {
when: SystemTime::now(),
task_kind: ctx.task_kind(),
access_kind,
};
let mut locked = self.0.lock().unwrap();
locked.iter_mut().for_each(|inner| {
inner.first_access.get_or_insert(this_access);
inner.count_by_access_kind[access_kind] += 1;
inner.task_kind_flag |= ctx.task_kind();
inner.last_accesses.write(this_access);
})
self.record_access_at(SystemTime::now())
}
fn as_api_model(
&self,
reset: LayerAccessStatsReset,
) -> pageserver_api::models::LayerAccessStats {
let mut locked = self.0.lock().unwrap();
let inner = &mut locked.for_scraping_api;
let LayerAccessStatsInner {
first_access,
count_by_access_kind,
task_kind_flag,
last_accesses,
last_residence_changes,
} = inner;
let ret = pageserver_api::models::LayerAccessStats {
access_count_by_access_kind: count_by_access_kind
.iter()
.map(|(kind, count)| (kind, *count))
.collect(),
task_kind_access_flag: task_kind_flag
.iter()
.map(|task_kind| Cow::Borrowed(task_kind.into())) // into static str, powered by strum_macros
.collect(),
first: first_access.as_ref().map(|a| a.as_api_model()),
accesses_history: last_accesses.map(|m| m.as_api_model()),
residence_events_history: last_residence_changes.clone(),
access_time: self
.read_low_res_timestamp(Self::ATIME_SHIFT)
.unwrap_or(UNIX_EPOCH),
residence_time: self
.read_low_res_timestamp(Self::RTIME_SHIFT)
.unwrap_or(UNIX_EPOCH),
visible: matches!(self.visibility(), LayerVisibilityHint::Visible),
};
match reset {
LayerAccessStatsReset::NoReset => (),
LayerAccessStatsReset::JustTaskKindFlags => {
inner.task_kind_flag.clear();
}
LayerAccessStatsReset::NoReset => {}
LayerAccessStatsReset::AllStats => {
*inner = LayerAccessStatsInner::default();
self.write_bits((Self::TS_MASK as u64) << Self::ATIME_SHIFT, 0x0);
self.write_bits((Self::TS_MASK as u64) << Self::RTIME_SHIFT, 0x0);
}
}
ret
}
/// Get the latest access timestamp, falling back to latest residence event, further falling
/// back to `SystemTime::now` for a usable timestamp for eviction.
pub(crate) fn latest_activity_or_now(&self) -> SystemTime {
self.latest_activity().unwrap_or_else(SystemTime::now)
/// Get the latest access timestamp, falling back to latest residence event. The latest residence event
/// will be this Layer's construction time, if its residence hasn't changed since then.
pub(crate) fn latest_activity(&self) -> SystemTime {
if let Some(t) = self.read_low_res_timestamp(Self::ATIME_SHIFT) {
t
} else {
self.read_low_res_timestamp(Self::RTIME_SHIFT)
.expect("Residence time is set on construction")
}
}
/// Get the latest access timestamp, falling back to latest residence event.
/// Whether this layer has been accessed (excluding in [`AccessStatsBehavior::Skip`]).
///
/// This function can only return `None` if there has not yet been a call to the
/// [`record_residence_event`] method. That would generally be considered an
/// implementation error. This function logs a rate-limited warning in that case.
///
/// TODO: use type system to avoid the need for `fallback`.
/// The approach in <https://github.com/neondatabase/neon/pull/3775>
/// could be used to enforce that a residence event is recorded
/// before a layer is added to the layer map. We could also have
/// a layer wrapper type that holds the LayerAccessStats, and ensure
/// that that type can only be produced by inserting into the layer map.
///
/// [`record_residence_event`]: Self::record_residence_event
fn latest_activity(&self) -> Option<SystemTime> {
let locked = self.0.lock().unwrap();
let inner = &locked.for_eviction_policy;
match inner.last_accesses.recent() {
Some(a) => Some(a.when),
None => match inner.last_residence_changes.recent() {
Some(e) => Some(e.timestamp),
None => {
static WARN_RATE_LIMIT: Lazy<Mutex<(usize, RateLimit)>> =
Lazy::new(|| Mutex::new((0, RateLimit::new(Duration::from_secs(10)))));
let mut guard = WARN_RATE_LIMIT.lock().unwrap();
guard.0 += 1;
let occurences = guard.0;
guard.1.call(move || {
warn!(parent: None, occurences, "latest_activity not available, this is an implementation bug, using fallback value");
});
None
}
},
/// This indicates whether the layer has been used for some purpose that would motivate
/// us to keep it on disk, such as for serving a getpage request.
fn accessed(&self) -> bool {
// Consider it accessed if the most recent access is more recent than
// the most recent change in residence status.
match (
self.read_low_res_timestamp(Self::ATIME_SHIFT),
self.read_low_res_timestamp(Self::RTIME_SHIFT),
) {
(None, _) => false,
(Some(_), None) => true,
(Some(a), Some(r)) => a >= r,
}
}
pub(crate) fn set_visibility(&self, visibility: LayerVisibilityHint) {
let value = match visibility {
LayerVisibilityHint::Visible => 0x1 << Self::VISIBILITY_SHIFT,
LayerVisibilityHint::Covered | LayerVisibilityHint::Uninitialized => 0x0,
};
self.write_bits(0x1 << Self::VISIBILITY_SHIFT, value);
}
pub(crate) fn visibility(&self) -> LayerVisibilityHint {
let read = self.0.load(std::sync::atomic::Ordering::Relaxed);
match (read >> Self::VISIBILITY_SHIFT) & 0x1 {
1 => LayerVisibilityHint::Visible,
0 => LayerVisibilityHint::Covered,
_ => unreachable!(),
}
}
}
/// Get a layer descriptor from a layer.
pub trait AsLayerDesc {
pub(crate) trait AsLayerDesc {
/// Get the layer descriptor.
fn layer_desc(&self) -> &PersistentLayerDesc;
}
pub mod tests {
use pageserver_api::shard::TenantShardId;
use utils::id::TimelineId;
use super::*;

View File

@@ -33,11 +33,14 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadPlanner,
};
use crate::tenant::{PageReconstructError, Timeline};
use crate::virtual_file::{self, VirtualFile};
@@ -49,10 +52,11 @@ use camino::{Utf8Path, Utf8PathBuf};
use futures::StreamExt;
use itertools::Itertools;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind};
use pageserver_api::models::ImageCompressionAlgorithm;
use pageserver_api::shard::TenantShardId;
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
@@ -261,7 +265,7 @@ impl DeltaLayer {
return Ok(());
}
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
let inner = self.load(ctx).await?;
inner.dump(ctx).await
}
@@ -294,12 +298,8 @@ impl DeltaLayer {
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
async fn load(
&self,
access_kind: LayerAccessKind,
ctx: &RequestContext,
) -> Result<&Arc<DeltaLayerInner>> {
self.access_stats.record_access(access_kind, ctx);
async fn load(&self, ctx: &RequestContext) -> Result<&Arc<DeltaLayerInner>> {
self.access_stats.record_access(ctx);
// Quick exit if already loaded
self.inner
.get_or_try_init(|| self.load_inner(ctx))
@@ -352,7 +352,7 @@ impl DeltaLayer {
summary.lsn_range,
metadata.len(),
),
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
access_stats: Default::default(),
inner: OnceCell::new(),
})
}
@@ -456,7 +456,12 @@ impl DeltaLayerWriterInner {
will_init: bool,
ctx: &RequestContext,
) -> (Vec<u8>, anyhow::Result<()>) {
assert!(self.lsn_range.start <= lsn);
assert!(
self.lsn_range.start <= lsn,
"lsn_start={}, lsn={}",
self.lsn_range.start,
lsn
);
// We don't want to use compression in delta layer creation
let compression = ImageCompressionAlgorithm::Disabled;
let (val, res) = self
@@ -747,12 +752,10 @@ impl DeltaLayer {
}
impl DeltaLayerInner {
#[cfg(test)]
pub(crate) fn key_range(&self) -> &Range<Key> {
&self.layer_key_range
}
#[cfg(test)]
pub(crate) fn lsn_range(&self) -> &Range<Lsn> {
&self.layer_lsn_range
}
@@ -1180,9 +1183,7 @@ impl DeltaLayerInner {
let delta_key = DeltaKey::from_slice(key);
let val_ref = ValueRef {
blob_ref: BlobRef(value),
reader: BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(
Adapter(self),
)),
layer: self,
};
let pos = BlobRef(value).pos();
if let Some(last) = all_keys.last_mut() {
@@ -1321,7 +1322,7 @@ impl DeltaLayerInner {
offsets.start.pos(),
offsets.end.pos(),
meta,
Some(max_read_size),
max_read_size,
))
}
} else {
@@ -1426,7 +1427,7 @@ impl DeltaLayerInner {
let keys = self.load_keys(ctx).await?;
async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result<String> {
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
let buf = val.load_raw(ctx).await?;
let val = Value::des(&buf)?;
let desc = match val {
Value::Image(img) => {
@@ -1461,8 +1462,7 @@ impl DeltaLayerInner {
use pageserver_api::key::CHECKPOINT_KEY;
use postgres_ffi::CheckPoint;
if key == CHECKPOINT_KEY {
let buf = val.reader.read_blob(val.blob_ref.pos(), ctx).await?;
let val = Value::des(&buf)?;
let val = val.load(ctx).await?;
match val {
Value::Image(img) => {
let checkpoint = CheckPoint::decode(&img)?;
@@ -1515,7 +1515,6 @@ impl DeltaLayerInner {
offset
}
#[cfg(test)]
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
@@ -1526,7 +1525,7 @@ impl DeltaLayerInner {
index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx),
key_values_batch: std::collections::VecDeque::new(),
is_end: false,
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
planner: StreamingVectoredReadPlanner::new(
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
1024, // The default value. Unit tests might use a different value
),
@@ -1547,17 +1546,24 @@ pub struct DeltaEntry<'a> {
/// Reference to an on-disk value
pub struct ValueRef<'a> {
blob_ref: BlobRef,
reader: BlockCursor<'a>,
layer: &'a DeltaLayerInner,
}
impl<'a> ValueRef<'a> {
/// Loads the value from disk
pub async fn load(&self, ctx: &RequestContext) -> Result<Value> {
// theoretically we *could* record an access time for each, but it does not really matter
let buf = self.reader.read_blob(self.blob_ref.pos(), ctx).await?;
let buf = self.load_raw(ctx).await?;
let val = Value::des(&buf)?;
Ok(val)
}
async fn load_raw(&self, ctx: &RequestContext) -> Result<Vec<u8>> {
let reader = BlockCursor::new(crate::tenant::block_io::BlockReaderRef::Adapter(Adapter(
self.layer,
)));
let buf = reader.read_blob(self.blob_ref.pos(), ctx).await?;
Ok(buf)
}
}
pub(crate) struct Adapter<T>(T);
@@ -1591,17 +1597,15 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del
}
}
#[cfg(test)]
pub struct DeltaLayerIterator<'a> {
delta_layer: &'a DeltaLayerInner,
ctx: &'a RequestContext,
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
planner: StreamingVectoredReadPlanner,
index_iter: DiskBtreeIterator<'a>,
key_values_batch: VecDeque<(Key, Lsn, Value)>,
is_end: bool,
}
#[cfg(test)]
impl<'a> DeltaLayerIterator<'a> {
/// Retrieve a batch of key-value pairs into the iterator buffer.
async fn next_batch(&mut self) -> anyhow::Result<()> {
@@ -1615,13 +1619,17 @@ impl<'a> DeltaLayerIterator<'a> {
let lsn = DeltaKey::extract_lsn_from_buf(&raw_key);
let blob_ref = BlobRef(value);
let offset = blob_ref.pos();
if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) {
if let Some(batch_plan) = self.planner.handle(key, lsn, offset) {
break batch_plan;
}
} else {
self.is_end = true;
let data_end_offset = self.delta_layer.index_start_offset();
break self.planner.handle_range_end(data_end_offset);
if let Some(item) = self.planner.handle_range_end(data_end_offset) {
break item;
} else {
return Ok(()); // TODO: test empty iterator
}
}
};
let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file);
@@ -1664,6 +1672,7 @@ pub(crate) mod test {
use rand::RngCore;
use super::*;
use crate::repository::Value;
use crate::tenant::harness::TIMELINE_ID;
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
use crate::tenant::Tenant;
@@ -1673,6 +1682,7 @@ pub(crate) mod test {
tenant::{disk_btree::tests::TestDisk, harness::TenantHarness},
DEFAULT_PG_VERSION,
};
use bytes::Bytes;
/// Construct an index for a fictional delta layer and and then
/// traverse in order to plan vectored reads for a query. Finally,
@@ -1925,7 +1935,7 @@ pub(crate) mod test {
#[tokio::test]
async fn test_delta_layer_vectored_read_end_to_end() -> anyhow::Result<()> {
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read")?;
let harness = TenantHarness::create("test_delta_layer_oversized_vectored_read").await?;
let (tenant, ctx) = harness.load().await;
let timeline_id = TimelineId::generate();
@@ -2025,7 +2035,9 @@ pub(crate) mod test {
use crate::walrecord::NeonWalRecord;
use bytes::Bytes;
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke")
.await
.unwrap();
let (tenant, ctx) = h.load().await;
let ctx = &ctx;
let timeline = tenant
@@ -2241,6 +2253,15 @@ pub(crate) mod test {
(k1, l1).cmp(&(k2, l2))
}
pub(crate) fn sort_delta_value(
(k1, l1, v1): &(Key, Lsn, Value),
(k2, l2, v2): &(Key, Lsn, Value),
) -> std::cmp::Ordering {
let order_1 = if v1.is_image() { 0 } else { 1 };
let order_2 = if v2.is_image() { 0 } else { 1 };
(k1, l1, order_1).cmp(&(k2, l2, order_2))
}
pub(crate) async fn produce_delta_layer(
tenant: &Tenant,
tline: &Arc<Timeline>,
@@ -2249,7 +2270,7 @@ pub(crate) mod test {
) -> anyhow::Result<ResidentLayer> {
deltas.sort_by(sort_delta);
let (key_start, _, _) = deltas.first().unwrap();
let (key_max, _, _) = deltas.first().unwrap();
let (key_max, _, _) = deltas.last().unwrap();
let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap();
let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap();
let lsn_end = Lsn(lsn_max.0 + 1);
@@ -2294,10 +2315,7 @@ pub(crate) mod test {
#[tokio::test]
async fn delta_layer_iterator() {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("delta_layer_iterator").unwrap();
let harness = TenantHarness::create("delta_layer_iterator").await.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant

View File

@@ -29,13 +29,16 @@ use crate::page_cache::{self, FileId, PAGE_SZ};
use crate::repository::{Key, Value, KEY_SIZE};
use crate::tenant::blob_io::BlobWriter;
use crate::tenant::block_io::{BlockBuf, BlockReader, FileBlockReader};
use crate::tenant::disk_btree::{DiskBtreeBuilder, DiskBtreeReader, VisitDirection};
use crate::tenant::disk_btree::{
DiskBtreeBuilder, DiskBtreeIterator, DiskBtreeReader, VisitDirection,
};
use crate::tenant::storage_layer::{
LayerAccessStats, ValueReconstructResult, ValueReconstructState,
};
use crate::tenant::timeline::GetVectoredError;
use crate::tenant::vectored_blob_io::{
BlobFlag, MaxVectoredReadBytes, VectoredBlobReader, VectoredRead, VectoredReadPlanner,
BlobFlag, MaxVectoredReadBytes, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
VectoredReadPlanner,
};
use crate::tenant::{PageReconstructError, Timeline};
use crate::virtual_file::{self, VirtualFile};
@@ -46,10 +49,10 @@ use camino::{Utf8Path, Utf8PathBuf};
use hex;
use itertools::Itertools;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::LayerAccessKind;
use pageserver_api::shard::{ShardIdentity, TenantShardId};
use rand::{distributions::Alphanumeric, Rng};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs::File;
use std::io::SeekFrom;
use std::ops::Range;
@@ -224,7 +227,7 @@ impl ImageLayer {
return Ok(());
}
let inner = self.load(LayerAccessKind::Dump, ctx).await?;
let inner = self.load(ctx).await?;
inner.dump(ctx).await?;
@@ -251,12 +254,8 @@ impl ImageLayer {
/// Open the underlying file and read the metadata into memory, if it's
/// not loaded already.
///
async fn load(
&self,
access_kind: LayerAccessKind,
ctx: &RequestContext,
) -> Result<&ImageLayerInner> {
self.access_stats.record_access(access_kind, ctx);
async fn load(&self, ctx: &RequestContext) -> Result<&ImageLayerInner> {
self.access_stats.record_access(ctx);
self.inner
.get_or_try_init(|| self.load_inner(ctx))
.await
@@ -308,7 +307,7 @@ impl ImageLayer {
metadata.len(),
), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
lsn: summary.lsn,
access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
access_stats: Default::default(),
inner: OnceCell::new(),
})
}
@@ -369,12 +368,10 @@ impl ImageLayer {
}
impl ImageLayerInner {
#[cfg(test)]
pub(crate) fn key_range(&self) -> &Range<Key> {
&self.key_range
}
#[cfg(test)]
pub(crate) fn lsn(&self) -> Lsn {
self.lsn
}
@@ -699,7 +696,6 @@ impl ImageLayerInner {
}
}
#[cfg(test)]
pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> ImageLayerIterator<'a> {
let block_reader = FileBlockReader::new(&self.file, self.file_id);
let tree_reader =
@@ -708,9 +704,9 @@ impl ImageLayerInner {
image_layer: self,
ctx,
index_iter: tree_reader.iter(&[0; KEY_SIZE], ctx),
key_values_batch: std::collections::VecDeque::new(),
key_values_batch: VecDeque::new(),
is_end: false,
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new(
planner: StreamingVectoredReadPlanner::new(
1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer.
1024, // The default value. Unit tests might use a different value
),
@@ -737,6 +733,9 @@ struct ImageLayerWriterInner {
key_range: Range<Key>,
lsn: Lsn,
// Total uncompressed bytes passed into put_image
uncompressed_bytes: u64,
blob_writer: BlobWriter<false>,
tree: DiskBtreeBuilder<BlockBuf, KEY_SIZE>,
}
@@ -792,6 +791,7 @@ impl ImageLayerWriterInner {
lsn,
tree: tree_builder,
blob_writer,
uncompressed_bytes: 0,
};
Ok(writer)
@@ -809,7 +809,12 @@ impl ImageLayerWriterInner {
ctx: &RequestContext,
) -> anyhow::Result<()> {
ensure!(self.key_range.contains(&key));
let (_img, res) = self.blob_writer.write_blob(img, ctx).await;
let compression = self.conf.image_compression;
self.uncompressed_bytes += img.len() as u64;
let (_img, res) = self
.blob_writer
.write_blob_maybe_compressed(img, ctx, compression)
.await;
// TODO: re-use the buffer for `img` further upstack
let off = res?;
@@ -831,6 +836,11 @@ impl ImageLayerWriterInner {
let index_start_blk =
((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32;
// Calculate compression ratio
let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
let mut file = self.blob_writer.into_inner();
// Write out the index
@@ -970,17 +980,15 @@ impl Drop for ImageLayerWriter {
}
}
#[cfg(test)]
pub struct ImageLayerIterator<'a> {
image_layer: &'a ImageLayerInner,
ctx: &'a RequestContext,
planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner,
index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>,
key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>,
planner: StreamingVectoredReadPlanner,
index_iter: DiskBtreeIterator<'a>,
key_values_batch: VecDeque<(Key, Lsn, Value)>,
is_end: bool,
}
#[cfg(test)]
impl<'a> ImageLayerIterator<'a> {
/// Retrieve a batch of key-value pairs into the iterator buffer.
async fn next_batch(&mut self) -> anyhow::Result<()> {
@@ -994,14 +1002,17 @@ impl<'a> ImageLayerIterator<'a> {
Key::from_slice(&raw_key[..KEY_SIZE]),
self.image_layer.lsn,
offset,
BlobFlag::None,
) {
break batch_plan;
}
} else {
self.is_end = true;
let payload_end = self.image_layer.index_start_blk as u64 * PAGE_SZ as u64;
break self.planner.handle_range_end(payload_end);
if let Some(item) = self.planner.handle_range_end(payload_end) {
break item;
} else {
return Ok(()); // TODO: a test case on empty iterator
}
}
};
let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file);
@@ -1095,6 +1106,7 @@ mod test {
ShardIdentity::unsharded(),
get_next_gen(),
)
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let timeline = tenant
@@ -1161,6 +1173,7 @@ mod test {
// But here, all we care about is that the gen number is unique.
get_next_gen(),
)
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let timeline = tenant
@@ -1292,7 +1305,7 @@ mod test {
#[tokio::test]
async fn image_layer_iterator() {
let harness = TenantHarness::create("image_layer_iterator").unwrap();
let harness = TenantHarness::create("image_layer_iterator").await.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant

View File

@@ -18,7 +18,7 @@ use anyhow::{anyhow, ensure, Result};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::InMemoryLayerInfo;
use pageserver_api::shard::TenantShardId;
use std::collections::{BTreeMap, BinaryHeap, HashSet};
use std::collections::BTreeMap;
use std::sync::{Arc, OnceLock};
use std::time::Instant;
use tracing::*;
@@ -375,15 +375,6 @@ impl InMemoryLayer {
let inner = self.inner.read().await;
let reader = inner.file.block_cursor();
#[derive(Eq, PartialEq, Ord, PartialOrd)]
struct BlockRead {
key: Key,
lsn: Lsn,
block_offset: u64,
}
let mut planned_block_reads = BinaryHeap::new();
for range in keyspace.ranges.iter() {
for (key, vec_map) in inner.index.range(range.start..range.end) {
let lsn_range = match reconstruct_state.get_cached_lsn(key) {
@@ -392,49 +383,32 @@ impl InMemoryLayer {
};
let slice = vec_map.slice_range(lsn_range);
for (entry_lsn, pos) in slice.iter().rev() {
planned_block_reads.push(BlockRead {
key: *key,
lsn: *entry_lsn,
block_offset: *pos,
});
// TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
let buf = reader.read_blob(*pos, &ctx).await;
if let Err(e) = buf {
reconstruct_state
.on_key_error(*key, PageReconstructError::from(anyhow!(e)));
break;
}
let value = Value::des(&buf.unwrap());
if let Err(e) = value {
reconstruct_state
.on_key_error(*key, PageReconstructError::from(anyhow!(e)));
break;
}
let key_situation =
reconstruct_state.update_key(key, *entry_lsn, value.unwrap());
if key_situation == ValueReconstructSituation::Complete {
break;
}
}
}
}
let keyspace_size = keyspace.total_raw_size();
let mut completed_keys = HashSet::new();
while completed_keys.len() < keyspace_size && !planned_block_reads.is_empty() {
let block_read = planned_block_reads.pop().unwrap();
if completed_keys.contains(&block_read.key) {
continue;
}
// TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183
let buf = reader.read_blob(block_read.block_offset, &ctx).await;
if let Err(e) = buf {
reconstruct_state
.on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
completed_keys.insert(block_read.key);
continue;
}
let value = Value::des(&buf.unwrap());
if let Err(e) = value {
reconstruct_state
.on_key_error(block_read.key, PageReconstructError::from(anyhow!(e)));
completed_keys.insert(block_read.key);
continue;
}
let key_situation =
reconstruct_state.update_key(&block_read.key, block_read.lsn, value.unwrap());
if key_situation == ValueReconstructSituation::Complete {
completed_keys.insert(block_read.key);
}
}
reconstruct_state.on_lsn_advanced(&keyspace, self.start_lsn);
Ok(())

View File

@@ -1,9 +1,7 @@
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use pageserver_api::keyspace::KeySpace;
use pageserver_api::models::{
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
};
use pageserver_api::models::HistoricLayerInfo;
use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
use std::ops::Range;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
@@ -160,13 +158,10 @@ impl Layer {
metadata.file_size,
);
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);
let owner = Layer(Arc::new(LayerInner::new(
conf,
timeline,
local_path,
access_stats,
desc,
None,
metadata.generation,
@@ -193,8 +188,6 @@ impl Layer {
metadata.file_size,
);
let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);
let mut resident = None;
let owner = Layer(Arc::new_cyclic(|owner| {
@@ -209,7 +202,6 @@ impl Layer {
conf,
timeline,
local_path,
access_stats,
desc,
Some(inner),
metadata.generation,
@@ -245,11 +237,6 @@ impl Layer {
version: 0,
});
resident = Some(inner.clone());
let access_stats = LayerAccessStats::empty_will_record_residence_event_later();
access_stats.record_residence_event(
LayerResidenceStatus::Resident,
LayerResidenceEventReason::LayerCreate,
);
let local_path = local_layer_path(
conf,
@@ -259,16 +246,22 @@ impl Layer {
&timeline.generation,
);
LayerInner::new(
let layer = LayerInner::new(
conf,
timeline,
local_path,
access_stats,
desc,
Some(inner),
timeline.generation,
timeline.get_shard_index(),
)
);
// Newly created layers are marked visible by default: the usual case is that they were created to be read.
layer
.access_stats
.set_visibility(super::LayerVisibilityHint::Visible);
layer
}));
let downloaded = resident.expect("just initialized");
@@ -332,9 +325,7 @@ impl Layer {
use anyhow::ensure;
let layer = self.0.get_or_maybe_download(true, Some(ctx)).await?;
self.0
.access_stats
.record_access(LayerAccessKind::GetValueReconstructData, ctx);
self.0.access_stats.record_access(ctx);
if self.layer_desc().is_delta {
ensure!(lsn_range.start >= self.layer_desc().lsn_range.start);
@@ -368,9 +359,7 @@ impl Layer {
other => GetVectoredError::Other(anyhow::anyhow!(other)),
})?;
self.0
.access_stats
.record_access(LayerAccessKind::GetValueReconstructData, ctx);
self.0.access_stats.record_access(ctx);
layer
.get_values_reconstruct_data(keyspace, lsn_range, reconstruct_data, &self.0, ctx)
@@ -385,6 +374,7 @@ impl Layer {
}
/// Get all key/values in the layer. Should be replaced with an iterator-based API in the future.
#[allow(dead_code)]
pub(crate) async fn load_key_values(
&self,
ctx: &RequestContext,
@@ -693,6 +683,18 @@ impl Drop for LayerInner {
// and we could be delaying shutdown for nothing.
}
if let Some(timeline) = self.timeline.upgrade() {
// Only need to decrement metrics if the timeline still exists: otherwise
// it will have already de-registered these metrics via TimelineMetrics::shutdown
if self.desc.is_delta() {
timeline.metrics.layer_count_delta.dec();
timeline.metrics.layer_size_delta.sub(self.desc.file_size);
} else {
timeline.metrics.layer_count_image.dec();
timeline.metrics.layer_size_image.sub(self.desc.file_size);
}
}
if !*self.wanted_deleted.get_mut() {
return;
}
@@ -773,7 +775,6 @@ impl LayerInner {
conf: &'static PageServerConf,
timeline: &Arc<Timeline>,
local_path: Utf8PathBuf,
access_stats: LayerAccessStats,
desc: PersistentLayerDesc,
downloaded: Option<Arc<DownloadedLayer>>,
generation: Generation,
@@ -791,6 +792,15 @@ impl LayerInner {
(heavier_once_cell::OnceCell::default(), 0, Status::Evicted)
};
// This object acts as a RAII guard on these metrics: increment on construction
if desc.is_delta() {
timeline.metrics.layer_count_delta.inc();
timeline.metrics.layer_size_delta.add(desc.file_size);
} else {
timeline.metrics.layer_count_image.inc();
timeline.metrics.layer_size_image.add(desc.file_size);
}
LayerInner {
conf,
debug_str: {
@@ -799,7 +809,7 @@ impl LayerInner {
path: local_path,
desc,
timeline: Arc::downgrade(timeline),
access_stats,
access_stats: Default::default(),
wanted_deleted: AtomicBool::new(false),
inner,
version: AtomicUsize::new(version),
@@ -1154,10 +1164,7 @@ impl LayerInner {
LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
}
self.access_stats.record_residence_event(
LayerResidenceStatus::Resident,
LayerResidenceEventReason::ResidenceChange,
);
self.access_stats.record_residence_event();
Ok(self.initialize_after_layer_is_on_disk(permit))
}
@@ -1276,7 +1283,7 @@ impl LayerInner {
lsn_end: lsn_range.end,
remote: !resident,
access_stats,
l0: crate::tenant::layer_map::LayerMap::is_l0(self.layer_desc()),
l0: crate::tenant::layer_map::LayerMap::is_l0(&self.layer_desc().key_range),
}
} else {
let lsn = self.desc.image_layer_lsn();
@@ -1469,14 +1476,22 @@ impl LayerInner {
let duration = SystemTime::now().duration_since(local_layer_mtime);
match duration {
Ok(elapsed) => {
timeline
.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(elapsed);
let accessed = self.access_stats.accessed();
if accessed {
// Only layers used for reads contribute to our "low residence" metric that is used
// to detect thrashing. Layers promoted for other reasons (e.g. compaction) are allowed
// to be rapidly evicted without contributing to this metric.
timeline
.metrics
.evictions_with_low_residence_duration
.read()
.unwrap()
.observe(elapsed);
}
tracing::info!(
residence_millis = elapsed.as_millis(),
accessed,
"evicted layer after known residence period"
);
}
@@ -1503,10 +1518,7 @@ impl LayerInner {
}
}
self.access_stats.record_residence_event(
LayerResidenceStatus::Evicted,
LayerResidenceEventReason::ResidenceChange,
);
self.access_stats.record_residence_event();
self.status.as_ref().unwrap().send_replace(Status::Evicted);
@@ -1832,9 +1844,7 @@ impl ResidentLayer {
// this is valid because the DownloadedLayer::kind is a OnceCell, not a
// Mutex<OnceCell>, so we cannot go and deinitialize the value with OnceCell::take
// while it's being held.
owner
.access_stats
.record_access(LayerAccessKind::KeyIter, ctx);
owner.access_stats.record_access(ctx);
delta_layer::DeltaLayerInner::load_keys(d, ctx)
.await
@@ -1889,7 +1899,7 @@ impl ResidentLayer {
self.owner.metadata()
}
#[cfg(test)]
/// Cast the layer to a delta, return an error if it is an image layer.
pub(crate) async fn get_as_delta(
&self,
ctx: &RequestContext,
@@ -1901,7 +1911,7 @@ impl ResidentLayer {
}
}
#[cfg(test)]
/// Cast the layer to an image, return an error if it is a delta layer.
pub(crate) async fn get_as_image(
&self,
ctx: &RequestContext,

View File

@@ -1,3 +1,5 @@
use std::time::UNIX_EPOCH;
use pageserver_api::key::CONTROLFILE_KEY;
use tokio::task::JoinSet;
use utils::{
@@ -7,7 +9,7 @@ use utils::{
use super::failpoints::{Failpoint, FailpointKind};
use super::*;
use crate::context::DownloadBehavior;
use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint};
use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness};
/// Used in tests to advance a future to wanted await point, and not futher.
@@ -22,7 +24,7 @@ const FOREVER: std::time::Duration = std::time::Duration::from_secs(ADVANCE.as_s
async fn smoke_test() {
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("smoke_test").unwrap();
let h = TenantHarness::create("smoke_test").await.unwrap();
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
let (tenant, _) = h.load().await;
@@ -176,7 +178,9 @@ async fn evict_and_wait_on_wanted_deleted() {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted").unwrap();
let h = TenantHarness::create("evict_and_wait_on_wanted_deleted")
.await
.unwrap();
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
let (tenant, ctx) = h.load().await;
@@ -258,7 +262,9 @@ fn read_wins_pending_eviction() {
rt.block_on(async move {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("read_wins_pending_eviction").unwrap();
let h = TenantHarness::create("read_wins_pending_eviction")
.await
.unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -390,7 +396,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
rt.block_on(async move {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create(name).unwrap();
let h = TenantHarness::create(name).await.unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -559,8 +565,9 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
#[tokio::test(start_paused = true)]
async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
let handle = tokio::runtime::Handle::current();
let h =
TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction").unwrap();
let h = TenantHarness::create("cancelled_get_or_maybe_download_does_not_cancel_eviction")
.await
.unwrap();
let (tenant, ctx) = h.load().await;
let timeline = tenant
@@ -636,7 +643,9 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
#[tokio::test(start_paused = true)]
async fn evict_and_wait_does_not_wait_for_download() {
// let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download").unwrap();
let h = TenantHarness::create("evict_and_wait_does_not_wait_for_download")
.await
.unwrap();
let (tenant, ctx) = h.load().await;
let span = h.span();
let download_span = span.in_scope(|| tracing::info_span!("downloading", timeline_id = 1));
@@ -733,7 +742,9 @@ async fn eviction_cancellation_on_drop() {
// this is the runtime on which Layer spawns the blocking tasks on
let handle = tokio::runtime::Handle::current();
let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap();
let h = TenantHarness::create("eviction_cancellation_on_drop")
.await
.unwrap();
utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
let (tenant, ctx) = h.load().await;
@@ -817,9 +828,9 @@ async fn eviction_cancellation_on_drop() {
#[test]
#[cfg(target_arch = "x86_64")]
fn layer_size() {
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
assert_eq!(std::mem::size_of::<LayerAccessStats>(), 8);
assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
assert_eq!(std::mem::size_of::<LayerInner>(), 2344);
assert_eq!(std::mem::size_of::<LayerInner>(), 312);
// it also has the utf8 path
}
@@ -959,3 +970,46 @@ fn spawn_blocking_pool_helper_actually_works() {
println!("joined");
});
}
/// Drop the low bits from a time, to emulate the precision loss in LayerAccessStats
fn lowres_time(hires: SystemTime) -> SystemTime {
let ts = hires.duration_since(UNIX_EPOCH).unwrap().as_secs();
UNIX_EPOCH + Duration::from_secs(ts)
}
#[test]
fn access_stats() {
let access_stats = LayerAccessStats::default();
// Default is visible
assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible);
access_stats.set_visibility(LayerVisibilityHint::Covered);
assert_eq!(access_stats.visibility(), LayerVisibilityHint::Covered);
access_stats.set_visibility(LayerVisibilityHint::Visible);
assert_eq!(access_stats.visibility(), LayerVisibilityHint::Visible);
let rtime = UNIX_EPOCH + Duration::from_secs(2000000000);
access_stats.record_residence_event_at(rtime);
assert_eq!(access_stats.latest_activity(), lowres_time(rtime));
let atime = UNIX_EPOCH + Duration::from_secs(2100000000);
access_stats.record_access_at(atime);
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
// Setting visibility doesn't clobber access time
access_stats.set_visibility(LayerVisibilityHint::Covered);
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
access_stats.set_visibility(LayerVisibilityHint::Visible);
assert_eq!(access_stats.latest_activity(), lowres_time(atime));
}
#[test]
fn access_stats_2038() {
// The access stats structure uses a timestamp representation that will run out
// of bits in 2038. One year before that, this unit test will start failing.
let one_year_from_now = SystemTime::now().duration_since(UNIX_EPOCH).unwrap()
+ Duration::from_secs(3600 * 24 * 365);
assert!(one_year_from_now.as_secs() < (2 << 31));
}

View File

@@ -25,7 +25,7 @@ pub struct PersistentLayerDesc {
///
/// - For an open in-memory layer, the end bound is MAX_LSN
/// - For a frozen in-memory layer or a delta layer, the end bound is a valid lsn after the
/// range start
/// range start
/// - An image layer represents snapshot at one LSN, so end_lsn is always the snapshot LSN + 1
pub lsn_range: Range<Lsn>,
/// Whether this is a delta layer, and also, is this incremental.

View File

@@ -248,6 +248,14 @@ impl LayerName {
Image(_) => "image",
}
}
/// Gets the key range encoded in the layer name.
pub fn key_range(&self) -> &Range<Key> {
match &self {
LayerName::Image(layer) => &layer.key_range,
LayerName::Delta(layer) => &layer.key_range,
}
}
}
impl fmt::Display for LayerName {

View File

@@ -96,15 +96,22 @@ impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> {
impl<'a> std::cmp::Ord for IteratorWrapper<'a> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
use std::cmp::Ordering;
let a = self.peek_next_key_lsn();
let b = other.peek_next_key_lsn();
let a = self.peek_next_key_lsn_value();
let b = other.peek_next_key_lsn_value();
match (a, b) {
(Some((k1, l1)), Some((k2, l2))) => {
let loaded_1 = if self.is_loaded() { 1 } else { 0 };
let loaded_2 = if other.is_loaded() { 1 } else { 0 };
(Some((k1, l1, v1)), Some((k2, l2, v2))) => {
fn map_value_to_num(val: &Option<&Value>) -> usize {
match val {
None => 0,
Some(Value::Image(_)) => 1,
Some(Value::WalRecord(_)) => 2,
}
}
let order_1 = map_value_to_num(&v1);
let order_2 = map_value_to_num(&v2);
// When key_lsn are the same, the unloaded iter will always appear before the loaded one.
// And note that we do a reverse at the end of the comparison, so it works with the max heap.
(k1, l1, loaded_1).cmp(&(k2, l2, loaded_2))
(k1, l1, order_1).cmp(&(k2, l2, order_2))
}
(Some(_), None) => Ordering::Less,
(None, Some(_)) => Ordering::Greater,
@@ -137,13 +144,16 @@ impl<'a> IteratorWrapper<'a> {
}
}
fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> {
fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> {
match self {
Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)),
Self::Loaded { iter } => iter
.peek()
.as_ref()
.map(|(key, lsn, val)| (key, *lsn, Some(val))),
Self::NotLoaded {
first_key_lower_bound: (key, lsn),
..
} => Some((key, *lsn)),
} => Some((key, *lsn, None)),
}
}
@@ -191,6 +201,13 @@ impl<'a> IteratorWrapper<'a> {
}
}
/// A merge iterator over delta/image layer iterators. When duplicated records are
/// found, the iterator will not perform any deduplication, and the caller should handle
/// these situation. By saying duplicated records, there are many possibilities:
/// * Two same delta at the same LSN.
/// * Two same image at the same LSN.
/// * Delta/image at the same LSN where the image has already applied the delta.
/// The iterator will always put the image before the delta.
pub struct MergeIterator<'a> {
heap: BinaryHeap<IteratorWrapper<'a>>,
}
@@ -245,8 +262,9 @@ mod tests {
use crate::{
tenant::{
harness::{TenantHarness, TIMELINE_ID},
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta},
storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value},
},
walrecord::NeonWalRecord,
DEFAULT_PG_VERSION,
};
@@ -275,7 +293,9 @@ mod tests {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap();
let harness = TenantHarness::create("merge_iterator_merge_in_between")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -338,7 +358,9 @@ mod tests {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap();
let harness = TenantHarness::create("merge_iterator_delta_merge")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
@@ -407,6 +429,133 @@ mod tests {
// TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge
}
// TODO: image layer merge, delta+image mixed merge
// TODO: is it possible to have duplicated delta at same LSN now? we might need to test that
#[tokio::test]
async fn delta_image_mixed_merge() {
use crate::repository::Value;
use bytes::Bytes;
let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let tline = tenant
.create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
.await
.unwrap();
fn get_key(id: u32) -> Key {
let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap();
key.field6 = id;
key
}
// In this test case, we want to test if the iterator still works correctly with multiple copies
// of a delta+image at the same LSN, for example, the following sequence a@10=+a, a@10=+a, a@10=ab, a@10=ab.
// Duplicated deltas/images are possible for old tenants before the full L0 compaction file name fix.
// An incomplete compaction could produce multiple exactly-the-same delta layers. Force image generation
// could produce overlapping images. Apart from duplicated deltas/images, in the current storage implementation
// one key-lsn could have a delta in the delta layer and one image in the image layer. The iterator should
// correctly process these situations and return everything as-is, and the upper layer of the system
// will handle duplicated LSNs.
let test_deltas1 = vec![
(
get_key(0),
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init()),
),
(
get_key(0),
Lsn(0x18),
Value::WalRecord(NeonWalRecord::wal_append("a")),
),
(
get_key(5),
Lsn(0x10),
Value::WalRecord(NeonWalRecord::wal_init()),
),
(
get_key(5),
Lsn(0x18),
Value::WalRecord(NeonWalRecord::wal_append("b")),
),
];
let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx)
.await
.unwrap();
let mut test_deltas2 = test_deltas1.clone();
test_deltas2.push((
get_key(10),
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"test")),
));
let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx)
.await
.unwrap();
let test_deltas3 = vec![
(
get_key(0),
Lsn(0x10),
Value::Image(Bytes::copy_from_slice(b"")),
),
(
get_key(5),
Lsn(0x18),
Value::Image(Bytes::copy_from_slice(b"b")),
),
(
get_key(15),
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"test")),
),
];
let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx)
.await
.unwrap();
let mut test_deltas4 = test_deltas3.clone();
test_deltas4.push((
get_key(20),
Lsn(0x20),
Value::Image(Bytes::copy_from_slice(b"test")),
));
let resident_layer_4 = produce_delta_layer(&tenant, &tline, test_deltas4.clone(), &ctx)
.await
.unwrap();
let mut expect = Vec::new();
expect.extend(test_deltas1);
expect.extend(test_deltas2);
expect.extend(test_deltas3);
expect.extend(test_deltas4);
expect.sort_by(sort_delta_value);
// Test with different layer order for MergeIterator::create to ensure the order
// is stable.
let mut merge_iter = MergeIterator::create(
&[
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
],
&[],
&ctx,
);
assert_merge_iter_equal(&mut merge_iter, &expect).await;
let mut merge_iter = MergeIterator::create(
&[
resident_layer_1.get_as_delta(&ctx).await.unwrap(),
resident_layer_4.get_as_delta(&ctx).await.unwrap(),
resident_layer_3.get_as_delta(&ctx).await.unwrap(),
resident_layer_2.get_as_delta(&ctx).await.unwrap(),
],
&[],
&ctx,
);
assert_merge_iter_equal(&mut merge_iter, &expect).await;
is_send(merge_iter);
}
fn is_send(_: impl Send) {}
}

View File

@@ -101,7 +101,6 @@ pub fn start_background_loops(
Some(tenant_shard_id),
None,
&format!("compactor for tenant {tenant_shard_id}"),
false,
{
let tenant = Arc::clone(tenant);
let background_jobs_can_start = background_jobs_can_start.cloned();
@@ -125,7 +124,6 @@ pub fn start_background_loops(
Some(tenant_shard_id),
None,
&format!("garbage collector for tenant {tenant_shard_id}"),
false,
{
let tenant = Arc::clone(tenant);
let background_jobs_can_start = background_jobs_can_start.cloned();
@@ -149,7 +147,6 @@ pub fn start_background_loops(
Some(tenant_shard_id),
None,
&format!("ingest housekeeping for tenant {tenant_shard_id}"),
false,
{
let tenant = Arc::clone(tenant);
let background_jobs_can_start = background_jobs_can_start.cloned();

View File

@@ -1,5 +1,5 @@
pub(crate) mod analysis;
mod compaction;
pub(crate) mod compaction;
pub mod delete;
pub(crate) mod detach_ancestor;
mod eviction_task;
@@ -69,6 +69,7 @@ use std::{
use crate::{
aux_file::AuxFileSizeEstimator,
tenant::{
config::defaults::DEFAULT_PITR_INTERVAL,
layer_map::{LayerMap, SearchResult},
metadata::TimelineMetadata,
storage_layer::PersistentLayerDesc,
@@ -197,7 +198,7 @@ impl PartialOrd for Hole {
/// Temporary function for immutable storage state refactor, ensures we are dropping mutex guard instead of other things.
/// Can be removed after all refactors are done.
fn drop_rlock<T>(rlock: tokio::sync::OwnedRwLockReadGuard<T>) {
fn drop_rlock<T>(rlock: tokio::sync::RwLockReadGuard<T>) {
drop(rlock)
}
@@ -270,7 +271,7 @@ pub struct Timeline {
///
/// In the future, we'll be able to split up the tuple of LayerMap and `LayerFileManager`,
/// so that e.g. on-demand-download/eviction, and layer spreading, can operate just on `LayerFileManager`.
pub(crate) layers: Arc<tokio::sync::RwLock<LayerManager>>,
pub(crate) layers: tokio::sync::RwLock<LayerManager>,
last_freeze_at: AtomicLsn,
// Atomic would be more appropriate here.
@@ -459,7 +460,7 @@ pub(crate) struct GcInfo {
/// Currently, this includes all points where child branches have
/// been forked off from. In the future, could also include
/// explicit user-defined snapshot points.
pub(crate) retain_lsns: Vec<Lsn>,
pub(crate) retain_lsns: Vec<(Lsn, TimelineId)>,
/// The cutoff coordinates, which are combined by selecting the minimum.
pub(crate) cutoffs: GcCutoffs,
@@ -475,39 +476,43 @@ impl GcInfo {
pub(crate) fn min_cutoff(&self) -> Lsn {
self.cutoffs.select_min()
}
pub(super) fn insert_child(&mut self, child_id: TimelineId, child_lsn: Lsn) {
self.retain_lsns.push((child_lsn, child_id));
self.retain_lsns.sort_by_key(|i| i.0);
}
pub(super) fn remove_child(&mut self, child_id: TimelineId) {
self.retain_lsns.retain(|i| i.1 != child_id);
}
}
/// The `GcInfo` component describing which Lsns need to be retained.
#[derive(Debug)]
/// The `GcInfo` component describing which Lsns need to be retained. Functionally, this
/// is a single number (the oldest LSN which we must retain), but it internally distinguishes
/// between time-based and space-based retention for observability and consumption metrics purposes.
#[derive(Debug, Clone)]
pub(crate) struct GcCutoffs {
/// Keep everything newer than this point.
///
/// This is calculated by subtracting 'gc_horizon' setting from
/// last-record LSN
///
/// FIXME: is this inclusive or exclusive?
pub(crate) horizon: Lsn,
/// Calculated from the [`TenantConf::gc_horizon`], this LSN indicates how much
/// history we must keep to retain a specified number of bytes of WAL.
pub(crate) space: Lsn,
/// In addition to 'retain_lsns' and 'horizon_cutoff', keep everything newer than this
/// point.
///
/// This is calculated by finding a number such that a record is needed for PITR
/// if only if its LSN is larger than 'pitr_cutoff'.
pub(crate) pitr: Lsn,
/// Calculated from [`TenantConf::pitr_interval`], this LSN indicates how much
/// history we must keep to enable reading back at least the PITR interval duration.
pub(crate) time: Lsn,
}
impl Default for GcCutoffs {
fn default() -> Self {
Self {
horizon: Lsn::INVALID,
pitr: Lsn::INVALID,
space: Lsn::INVALID,
time: Lsn::INVALID,
}
}
}
impl GcCutoffs {
fn select_min(&self) -> Lsn {
std::cmp::min(self.horizon, self.pitr)
std::cmp::min(self.space, self.time)
}
}
@@ -866,7 +871,7 @@ impl Timeline {
let gc_info = self.gc_info.read().unwrap();
let history = self
.get_last_record_lsn()
.checked_sub(gc_info.cutoffs.pitr)
.checked_sub(gc_info.cutoffs.time)
.unwrap_or(Lsn(0))
.0;
(history, gc_info.within_ancestor_pitr)
@@ -1565,7 +1570,7 @@ impl Timeline {
) -> anyhow::Result<()> {
ensure!(
lsn >= **latest_gc_cutoff_lsn,
"LSN {} is earlier than latest GC horizon {} (we might've already garbage collected needed data)",
"LSN {} is earlier than latest GC cutoff {} (we might've already garbage collected needed data)",
lsn,
**latest_gc_cutoff_lsn,
);
@@ -2311,6 +2316,11 @@ impl Timeline {
)
};
if let Some(ancestor) = &ancestor {
let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
ancestor_gc_info.insert_child(timeline_id, metadata.ancestor_lsn());
}
Arc::new_cyclic(|myself| {
let metrics = TimelineMetrics::new(
&tenant_shard_id,
@@ -2481,7 +2491,6 @@ impl Timeline {
Some(self.tenant_shard_id),
Some(self.timeline_id),
"layer flush task",
false,
async move {
let _guard = guard;
let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error);
@@ -2826,7 +2835,6 @@ impl Timeline {
Some(self.tenant_shard_id),
Some(self.timeline_id),
"initial size calculation",
false,
// NB: don't log errors here, task_mgr will do that.
async move {
let cancel = task_mgr::shutdown_token();
@@ -2995,7 +3003,6 @@ impl Timeline {
Some(self.tenant_shard_id),
Some(self.timeline_id),
"ondemand logical size calculation",
false,
async move {
let res = self_clone
.logical_size_calculation_task(lsn, cause, &ctx)
@@ -3162,7 +3169,7 @@ impl Timeline {
let guard = self.layers.read().await;
let resident = guard.likely_resident_layers().map(|layer| {
let last_activity_ts = layer.access_stats().latest_activity_or_now();
let last_activity_ts = layer.access_stats().latest_activity();
HeatMapLayer::new(
layer.layer_desc().layer_name(),
@@ -3408,6 +3415,8 @@ impl Timeline {
}
}
#[allow(unknown_lints)] // doc_lazy_continuation is still a new lint
#[allow(clippy::doc_lazy_continuation)]
/// Get the data needed to reconstruct all keys in the provided keyspace
///
/// The algorithm is as follows:
@@ -4474,10 +4483,10 @@ impl Timeline {
/// are required. Since checking if new image layers are required is expensive in
/// terms of CPU, we only do it in the following cases:
/// 1. If the timeline has ingested sufficient WAL to justify the cost
/// 2. If enough time has passed since the last check
/// 2.1. For large tenants, we wish to perform the check more often since they
/// suffer from the lack of image layers
/// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval
/// 2. If enough time has passed since the last check:
/// 1. For large tenants, we wish to perform the check more often since they
/// suffer from the lack of image layers
/// 2. For small tenants (that can mostly fit in RAM), we use a much longer interval
fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
@@ -4719,7 +4728,7 @@ impl Timeline {
/// Requires a timeline that:
/// - has an ancestor to detach from
/// - the ancestor does not have an ancestor -- follows from the original RFC limitations, not
/// a technical requirement
/// a technical requirement
///
/// After the operation has been started, it cannot be canceled. Upon restart it needs to be
/// polled again until completion.
@@ -4731,13 +4740,7 @@ impl Timeline {
tenant: &crate::tenant::Tenant,
options: detach_ancestor::Options,
ctx: &RequestContext,
) -> Result<
(
completion::Completion,
detach_ancestor::PreparedTimelineDetach,
),
detach_ancestor::Error,
> {
) -> Result<detach_ancestor::Progress, detach_ancestor::Error> {
detach_ancestor::prepare(self, tenant, options, ctx).await
}
@@ -4764,6 +4767,18 @@ impl Timeline {
}
}
impl Drop for Timeline {
fn drop(&mut self) {
if let Some(ancestor) = &self.ancestor_timeline {
// This lock should never be poisoned, but in case it is we do a .map() instead of
// an unwrap(), to avoid panicking in a destructor and thereby aborting the process.
if let Ok(mut gc_info) = ancestor.gc_info.write() {
gc_info.remove_child(self.timeline_id)
}
}
}
}
/// Top-level failure to compact.
#[derive(Debug, thiserror::Error)]
pub(crate) enum CompactionError {
@@ -4876,7 +4891,7 @@ impl Timeline {
// for compact_level0_phase1 creating an L0, which does not happen in practice
// because we have not implemented L0 => L0 compaction.
duplicated_layers.insert(l.layer_desc().key());
} else if LayerMap::is_l0(l.layer_desc()) {
} else if LayerMap::is_l0(&l.layer_desc().key_range) {
bail!("compaction generates a L0 layer file as output, which will cause infinite compaction.");
} else {
insert_layers.push(l.clone());
@@ -4944,24 +4959,21 @@ impl Timeline {
}
/// Find the Lsns above which layer files need to be retained on
/// garbage collection. This is separate from actually performing the GC,
/// and is updated more frequently, so that compaction can remove obsolete
/// page versions more aggressively.
/// garbage collection.
///
/// TODO: that's wishful thinking, compaction doesn't actually do that
/// currently.
/// We calculate two cutoffs, one based on time and one based on WAL size. `pitr`
/// controls the time cutoff (or ZERO to disable time-based retention), and `space_cutoff` controls
/// the space-based retention.
///
/// The 'cutoff_horizon' point is used to retain recent versions that might still be
/// needed by read-only nodes. (As of this writing, the caller just passes
/// the latest LSN subtracted by a constant, and doesn't do anything smart
/// to figure out what read-only nodes might actually need.)
///
/// The 'pitr' duration is used to calculate a 'pitr_cutoff', which can be used to determine
/// whether a record is needed for PITR.
/// This function doesn't simply to calculate time & space based retention: it treats time-based
/// retention as authoritative if enabled, and falls back to space-based retention if calculating
/// the LSN for a time point isn't possible. Therefore the GcCutoffs::horizon in the response might
/// be different to the `space_cutoff` input. Callers should treat the min() of the two cutoffs
/// in the response as the GC cutoff point for the timeline.
#[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
pub(super) async fn find_gc_cutoffs(
&self,
cutoff_horizon: Lsn,
space_cutoff: Lsn,
pitr: Duration,
cancel: &CancellationToken,
ctx: &RequestContext,
@@ -4974,58 +4986,87 @@ impl Timeline {
pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");
// First, calculate pitr_cutoff_timestamp and then convert it to LSN.
//
// Some unit tests depend on garbage-collection working even when
// CLOG data is missing, so that find_lsn_for_timestamp() doesn't
// work, so avoid calling it altogether if time-based retention is not
// configured. It would be pointless anyway.
let pitr_cutoff = if pitr != Duration::ZERO {
let now = SystemTime::now();
if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) {
let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp);
match self
.find_lsn_for_timestamp(pitr_timestamp, cancel, ctx)
.await?
{
LsnForTimestamp::Present(lsn) => lsn,
LsnForTimestamp::Future(lsn) => {
// The timestamp is in the future. That sounds impossible,
// but what it really means is that there hasn't been
// any commits since the cutoff timestamp.
//
// In this case we should use the LSN of the most recent commit,
// which is implicitly the last LSN in the log.
debug!("future({})", lsn);
self.get_last_record_lsn()
}
LsnForTimestamp::Past(lsn) => {
debug!("past({})", lsn);
// conservative, safe default is to remove nothing, when we
// have no commit timestamp data available
*self.get_latest_gc_cutoff_lsn()
}
LsnForTimestamp::NoData(lsn) => {
debug!("nodata({})", lsn);
// conservative, safe default is to remove nothing, when we
// have no commit timestamp data available
*self.get_latest_gc_cutoff_lsn()
}
}
} else {
// If we don't have enough data to convert to LSN,
// play safe and don't remove any layers.
*self.get_latest_gc_cutoff_lsn()
if cfg!(test) {
// Unit tests which specify zero PITR interval expect to avoid doing any I/O for timestamp lookup
if pitr == Duration::ZERO {
return Ok(GcCutoffs {
time: self.get_last_record_lsn(),
space: space_cutoff,
});
}
}
// Calculate a time-based limit on how much to retain:
// - if PITR interval is set, then this is our cutoff.
// - if PITR interval is not set, then we do a lookup
// based on DEFAULT_PITR_INTERVAL, so that size-based retention does not result in keeping history around permanently on idle databases.
let time_cutoff = {
let now = SystemTime::now();
let time_range = if pitr == Duration::ZERO {
humantime::parse_duration(DEFAULT_PITR_INTERVAL).expect("constant is invalid")
} else {
pitr
};
// If PITR is so large or `now` is so small that this underflows, we will retain no history (highly unexpected case)
let time_cutoff = now.checked_sub(time_range).unwrap_or(now);
let timestamp = to_pg_timestamp(time_cutoff);
match self.find_lsn_for_timestamp(timestamp, cancel, ctx).await? {
LsnForTimestamp::Present(lsn) => Some(lsn),
LsnForTimestamp::Future(lsn) => {
// The timestamp is in the future. That sounds impossible,
// but what it really means is that there hasn't been
// any commits since the cutoff timestamp.
//
// In this case we should use the LSN of the most recent commit,
// which is implicitly the last LSN in the log.
debug!("future({})", lsn);
Some(self.get_last_record_lsn())
}
LsnForTimestamp::Past(lsn) => {
debug!("past({})", lsn);
None
}
LsnForTimestamp::NoData(lsn) => {
debug!("nodata({})", lsn);
None
}
}
} else {
// No time-based retention was configured. Interpret this as "keep no history".
self.get_last_record_lsn()
};
Ok(GcCutoffs {
horizon: cutoff_horizon,
pitr: pitr_cutoff,
Ok(match (pitr, time_cutoff) {
(Duration::ZERO, Some(time_cutoff)) => {
// PITR is not set. Retain the size-based limit, or the default time retention,
// whichever requires less data.
GcCutoffs {
time: self.get_last_record_lsn(),
space: std::cmp::max(time_cutoff, space_cutoff),
}
}
(Duration::ZERO, None) => {
// PITR is not set, and time lookup failed
GcCutoffs {
time: self.get_last_record_lsn(),
space: space_cutoff,
}
}
(_, None) => {
// PITR interval is set & we didn't look up a timestamp successfully. Conservatively assume PITR
// cannot advance beyond what was already GC'd, and respect space-based retention
GcCutoffs {
time: *self.get_latest_gc_cutoff_lsn(),
space: space_cutoff,
}
}
(_, Some(time_cutoff)) => {
// PITR interval is set and we looked up timestamp successfully. Ignore
// size based retention and make time cutoff authoritative
GcCutoffs {
time: time_cutoff,
space: time_cutoff,
}
}
})
}
@@ -5050,12 +5091,16 @@ impl Timeline {
return Err(GcError::TimelineCancelled);
}
let (horizon_cutoff, pitr_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
let (space_cutoff, time_cutoff, retain_lsns, max_lsn_with_valid_lease) = {
let gc_info = self.gc_info.read().unwrap();
let horizon_cutoff = min(gc_info.cutoffs.horizon, self.get_disk_consistent_lsn());
let pitr_cutoff = gc_info.cutoffs.pitr;
let retain_lsns = gc_info.retain_lsns.clone();
let space_cutoff = min(gc_info.cutoffs.space, self.get_disk_consistent_lsn());
let time_cutoff = gc_info.cutoffs.time;
let retain_lsns = gc_info
.retain_lsns
.iter()
.map(|(lsn, _child_id)| *lsn)
.collect();
// Gets the maximum LSN that holds the valid lease.
//
@@ -5064,14 +5109,14 @@ impl Timeline {
let max_lsn_with_valid_lease = gc_info.leases.last_key_value().map(|(lsn, _)| *lsn);
(
horizon_cutoff,
pitr_cutoff,
space_cutoff,
time_cutoff,
retain_lsns,
max_lsn_with_valid_lease,
)
};
let mut new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff);
let mut new_gc_cutoff = Lsn::min(space_cutoff, time_cutoff);
let standby_horizon = self.standby_horizon.load();
// Hold GC for the standby, but as a safety guard do it only within some
// reasonable lag.
@@ -5100,8 +5145,8 @@ impl Timeline {
let res = self
.gc_timeline(
horizon_cutoff,
pitr_cutoff,
space_cutoff,
time_cutoff,
retain_lsns,
max_lsn_with_valid_lease,
new_gc_cutoff,
@@ -5119,8 +5164,8 @@ impl Timeline {
async fn gc_timeline(
&self,
horizon_cutoff: Lsn,
pitr_cutoff: Lsn,
space_cutoff: Lsn,
time_cutoff: Lsn,
retain_lsns: Vec<Lsn>,
max_lsn_with_valid_lease: Option<Lsn>,
new_gc_cutoff: Lsn,
@@ -5181,22 +5226,22 @@ impl Timeline {
result.layers_total += 1;
// 1. Is it newer than GC horizon cutoff point?
if l.get_lsn_range().end > horizon_cutoff {
if l.get_lsn_range().end > space_cutoff {
debug!(
"keeping {} because it's newer than horizon_cutoff {}",
"keeping {} because it's newer than space_cutoff {}",
l.layer_name(),
horizon_cutoff,
space_cutoff,
);
result.layers_needed_by_cutoff += 1;
continue 'outer;
}
// 2. It is newer than PiTR cutoff point?
if l.get_lsn_range().end > pitr_cutoff {
if l.get_lsn_range().end > time_cutoff {
debug!(
"keeping {} because it's newer than pitr_cutoff {}",
"keeping {} because it's newer than time_cutoff {}",
l.layer_name(),
pitr_cutoff,
time_cutoff,
);
result.layers_needed_by_pitr += 1;
continue 'outer;
@@ -5417,7 +5462,6 @@ impl Timeline {
Some(self.tenant_shard_id),
Some(self.timeline_id),
"download all remote layers task",
false,
async move {
self_clone.download_all_remote_layers(request).await;
let mut status_guard = self_clone.download_all_remote_layers_task_info.write().unwrap();
@@ -5568,7 +5612,7 @@ impl Timeline {
let file_size = layer.layer_desc().file_size;
max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size)));
let last_activity_ts = layer.access_stats().latest_activity_or_now();
let last_activity_ts = layer.access_stats().latest_activity();
EvictionCandidate {
layer: layer.into(),
@@ -6028,8 +6072,9 @@ mod tests {
#[tokio::test]
async fn two_layer_eviction_attempts_at_the_same_time() {
let harness =
TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();
let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
.await
.unwrap();
let (tenant, ctx) = harness.load().await;
let timeline = tenant

View File

@@ -26,15 +26,17 @@ use utils::id::TimelineId;
use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
use crate::page_cache;
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::config::defaults::{DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD};
use crate::tenant::storage_layer::merge_iterator::MergeIterator;
use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc, ValueReconstructState};
use crate::tenant::timeline::{drop_rlock, DeltaLayerWriter, ImageLayerWriter};
use crate::tenant::timeline::{Hole, ImageLayerCreationOutcome};
use crate::tenant::timeline::{Layer, ResidentLayer};
use crate::tenant::DeltaLayer;
use crate::virtual_file::{MaybeFatalIo, VirtualFile};
use crate::keyspace::KeySpace;
use crate::repository::Key;
use crate::repository::{Key, Value};
use utils::lsn::Lsn;
@@ -43,6 +45,60 @@ use pageserver_compaction::interface::*;
use super::CompactionError;
/// Maximum number of deltas before generating an image layer in bottom-most compaction.
const COMPACTION_DELTA_THRESHOLD: usize = 5;
/// The result of bottom-most compaction for a single key at each LSN.
#[derive(Debug)]
#[cfg_attr(test, derive(PartialEq))]
pub struct KeyLogAtLsn(pub Vec<(Lsn, Value)>);
/// The result of bottom-most compaction.
#[derive(Debug)]
#[cfg_attr(test, derive(PartialEq))]
pub(crate) struct KeyHistoryRetention {
/// Stores logs to reconstruct the value at the given LSN, that is to say, logs <= LSN or image == LSN.
pub(crate) below_horizon: Vec<(Lsn, KeyLogAtLsn)>,
/// Stores logs to reconstruct the value at any LSN above the horizon, that is to say, log > LSN.
pub(crate) above_horizon: KeyLogAtLsn,
}
impl KeyHistoryRetention {
async fn pipe_to(
self,
key: Key,
delta_writer: &mut Vec<(Key, Lsn, Value)>,
image_writer: &mut ImageLayerWriter,
ctx: &RequestContext,
) -> anyhow::Result<()> {
let mut first_batch = true;
for (_, KeyLogAtLsn(logs)) in self.below_horizon {
if first_batch {
if logs.len() == 1 && logs[0].1.is_image() {
let Value::Image(img) = &logs[0].1 else {
unreachable!()
};
image_writer.put_image(key, img.clone(), ctx).await?;
} else {
for (lsn, val) in logs {
delta_writer.push((key, lsn, val));
}
}
first_batch = false;
} else {
for (lsn, val) in logs {
delta_writer.push((key, lsn, val));
}
}
}
let KeyLogAtLsn(above_horizon_logs) = self.above_horizon;
for (lsn, val) in above_horizon_logs {
delta_writer.push((key, lsn, val));
}
Ok(())
}
}
impl Timeline {
/// TODO: cancellation
pub(crate) async fn compact_legacy(
@@ -195,7 +251,7 @@ impl Timeline {
tracing::info!(
"latest_gc_cutoff: {}, pitr cutoff {}",
*latest_gc_cutoff,
self.gc_info.read().unwrap().cutoffs.pitr
self.gc_info.read().unwrap().cutoffs.time
);
let layers = self.layers.read().await;
@@ -379,7 +435,7 @@ impl Timeline {
};
let begin = tokio::time::Instant::now();
let phase1_layers_locked = Arc::clone(&self.layers).read_owned().await;
let phase1_layers_locked = self.layers.read().await;
let now = tokio::time::Instant::now();
stats.read_lock_acquisition_micros =
DurationRecorder::Recorded(RecordedDuration(now - begin), now);
@@ -399,9 +455,9 @@ impl Timeline {
}
/// Level0 files first phase of compaction, explained in the [`Self::compact_legacy`] comment.
async fn compact_level0_phase1(
self: &Arc<Self>,
guard: tokio::sync::OwnedRwLockReadGuard<LayerManager>,
async fn compact_level0_phase1<'a>(
self: &'a Arc<Self>,
guard: tokio::sync::RwLockReadGuard<'a, LayerManager>,
mut stats: CompactLevel0Phase1StatsBuilder,
target_file_size: u64,
ctx: &RequestContext,
@@ -415,6 +471,7 @@ impl Timeline {
.map(|x| guard.get_from_desc(&x))
.collect_vec();
stats.level0_deltas_count = Some(level0_deltas.len());
// Only compact if enough layers have accumulated.
let threshold = self.get_compaction_threshold();
if level0_deltas.is_empty() || level0_deltas.len() < threshold {
@@ -445,6 +502,22 @@ impl Timeline {
let mut prev_lsn_end = first_level0_delta.layer_desc().lsn_range.end;
let mut deltas_to_compact = Vec::with_capacity(level0_deltas.len());
// Accumulate the size of layers in `deltas_to_compact`
let mut deltas_to_compact_bytes = 0;
// Under normal circumstances, we will accumulate up to compaction_interval L0s of size
// checkpoint_distance each. To avoid edge cases using extra system resources, bound our
// work in this function to only operate on this much delta data at once.
//
// Take the max of the configured value & the default, so that tests that configure tiny values
// can still use a sensible amount of memory, but if a deployed system configures bigger values we
// still let them compact a full stack of L0s in one go.
let delta_size_limit = std::cmp::max(
self.get_compaction_threshold(),
DEFAULT_COMPACTION_THRESHOLD,
) as u64
* std::cmp::max(self.get_checkpoint_distance(), DEFAULT_CHECKPOINT_DISTANCE);
deltas_to_compact.push(first_level0_delta.download_and_keep_resident().await?);
for l in level0_deltas_iter {
let lsn_range = &l.layer_desc().lsn_range;
@@ -453,7 +526,20 @@ impl Timeline {
break;
}
deltas_to_compact.push(l.download_and_keep_resident().await?);
deltas_to_compact_bytes += l.metadata().file_size;
prev_lsn_end = lsn_range.end;
if deltas_to_compact_bytes >= delta_size_limit {
info!(
l0_deltas_selected = deltas_to_compact.len(),
l0_deltas_total = level0_deltas.len(),
"L0 compaction picker hit max delta layer size limit: {}",
delta_size_limit
);
// Proceed with compaction, but only a subset of L0s
break;
}
}
let lsn_range = Range {
start: deltas_to_compact
@@ -957,6 +1043,188 @@ impl Timeline {
Ok(())
}
/// Take a list of images and deltas, produce images and deltas according to GC horizon and retain_lsns.
///
/// It takes a key, the values of the key within the compaction process, a GC horizon, and all retain_lsns below the horizon.
/// For now, it requires the `accumulated_values` contains the full history of the key (i.e., the key with the lowest LSN is
/// an image or a WAL not requiring a base image). This restriction will be removed once we implement gc-compaction on branch.
///
/// The function returns the deltas and the base image that need to be placed at each of the retain LSN. For example, we have:
///
/// A@0x10, +B@0x20, +C@0x30, +D@0x40, +E@0x50, +F@0x60
/// horizon = 0x50, retain_lsn = 0x20, 0x40, delta_threshold=3
///
/// The function will produce:
///
/// ```plain
/// 0x20(retain_lsn) -> img=AB@0x20 always produce a single image below the lowest retain LSN
/// 0x40(retain_lsn) -> deltas=[+C@0x30, +D@0x40] two deltas since the last base image, keeping the deltas
/// 0x50(horizon) -> deltas=[ABCDE@0x50] three deltas since the last base image, generate an image but put it in the delta
/// above_horizon -> deltas=[+F@0x60] full history above the horizon
/// ```
///
/// Note that `accumulated_values` must be sorted by LSN and should belong to a single key.
pub(crate) async fn generate_key_retention(
self: &Arc<Timeline>,
key: Key,
history: &[(Key, Lsn, Value)],
horizon: Lsn,
retain_lsn_below_horizon: &[Lsn],
delta_threshold_cnt: usize,
) -> anyhow::Result<KeyHistoryRetention> {
// Pre-checks for the invariants
if cfg!(debug_assertions) {
for (log_key, _, _) in history {
assert_eq!(log_key, &key, "mismatched key");
}
for i in 1..history.len() {
assert!(history[i - 1].1 <= history[i].1, "unordered LSN");
if history[i - 1].1 == history[i].1 {
assert!(
matches!(history[i - 1].2, Value::Image(_)),
"unordered delta/image, or duplicated delta"
);
}
}
if let Value::WalRecord(rec) = &history[0].2 {
assert!(rec.will_init(), "no base image");
}
for lsn in retain_lsn_below_horizon {
assert!(lsn < &horizon, "retain lsn must be below horizon")
}
for i in 1..retain_lsn_below_horizon.len() {
assert!(
retain_lsn_below_horizon[i - 1] <= retain_lsn_below_horizon[i],
"unordered LSN"
);
}
}
// Step 1: split history into len(retain_lsn_below_horizon) + 2 buckets, where the last bucket is for all deltas above the horizon,
// and the second-to-last bucket is for the horizon. Each bucket contains lsn_last_bucket < deltas <= lsn_this_bucket.
let (mut split_history, lsn_split_points) = {
let mut split_history = Vec::new();
split_history.resize_with(retain_lsn_below_horizon.len() + 2, Vec::new);
let mut lsn_split_points = Vec::with_capacity(retain_lsn_below_horizon.len() + 1);
for lsn in retain_lsn_below_horizon {
lsn_split_points.push(*lsn);
}
lsn_split_points.push(horizon);
let mut current_idx = 0;
for item @ (_, lsn, _) in history {
while current_idx < lsn_split_points.len() && *lsn > lsn_split_points[current_idx] {
current_idx += 1;
}
split_history[current_idx].push(item);
}
(split_history, lsn_split_points)
};
// Step 2: filter out duplicated records due to the k-merge of image/delta layers
for split_for_lsn in &mut split_history {
let mut prev_lsn = None;
let mut new_split_for_lsn = Vec::with_capacity(split_for_lsn.len());
for record @ (_, lsn, _) in std::mem::take(split_for_lsn) {
if let Some(prev_lsn) = &prev_lsn {
if *prev_lsn == lsn {
// The case that we have an LSN with both data from the delta layer and the image layer. As
// `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply
// drop this delta and keep the image.
//
// For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
// keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
// dropped.
continue;
}
}
prev_lsn = Some(lsn);
new_split_for_lsn.push(record);
}
*split_for_lsn = new_split_for_lsn;
}
// Step 3: generate images when necessary
let mut retention = Vec::with_capacity(split_history.len());
let mut records_since_last_image = 0;
let batch_cnt = split_history.len();
assert!(
batch_cnt >= 2,
"should have at least below + above horizon batches"
);
let mut replay_history: Vec<(Key, Lsn, Value)> = Vec::new();
for (i, split_for_lsn) in split_history.into_iter().enumerate() {
records_since_last_image += split_for_lsn.len();
let generate_image = if i == 0 {
// We always generate images for the first batch (below horizon / lowest retain_lsn)
true
} else if i == batch_cnt - 1 {
// Do not generate images for the last batch (above horizon)
false
} else if records_since_last_image >= delta_threshold_cnt {
// Generate images when there are too many records
true
} else {
false
};
replay_history.extend(split_for_lsn.iter().map(|x| (*x).clone()));
if let Some((_, _, val)) = replay_history.first() {
assert!(val.will_init(), "invalid history, no base image");
}
// Only retain the items after the last image record
for idx in (0..replay_history.len()).rev() {
if replay_history[idx].2.will_init() {
replay_history = replay_history[idx..].to_vec();
break;
}
}
if generate_image && records_since_last_image > 0 {
records_since_last_image = 0;
let history = std::mem::take(&mut replay_history);
let mut img = None;
let mut records = Vec::with_capacity(history.len());
if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() {
img = Some((*lsn, val.clone()));
for (_, lsn, val) in history.into_iter().skip(1) {
let Value::WalRecord(rec) = val else {
panic!("invalid record")
};
records.push((lsn, rec));
}
} else {
for (_, lsn, val) in history.into_iter() {
let Value::WalRecord(rec) = val else {
panic!("invalid record")
};
records.push((lsn, rec));
}
}
records.reverse();
let state = ValueReconstructState { img, records };
let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range
let img = self.reconstruct_value(key, request_lsn, state).await?;
replay_history.push((key, request_lsn, Value::Image(img.clone())));
retention.push(vec![(request_lsn, Value::Image(img))]);
} else {
retention.push(
split_for_lsn
.iter()
.map(|(_, lsn, value)| (*lsn, value.clone()))
.collect(),
);
}
}
let mut result = Vec::with_capacity(retention.len());
assert_eq!(retention.len(), lsn_split_points.len() + 1);
for (idx, logs) in retention.into_iter().enumerate() {
if idx == lsn_split_points.len() {
return Ok(KeyHistoryRetention {
below_horizon: result,
above_horizon: KeyLogAtLsn(logs),
});
} else {
result.push((lsn_split_points[idx], KeyLogAtLsn(logs)));
}
}
unreachable!()
}
/// An experimental compaction building block that combines compaction with garbage collection.
///
/// The current implementation picks all delta + image layers that are below or intersecting with
@@ -968,7 +1236,6 @@ impl Timeline {
_cancel: &CancellationToken,
ctx: &RequestContext,
) -> Result<(), CompactionError> {
use crate::tenant::storage_layer::ValueReconstructState;
use std::collections::BTreeSet;
info!("running enhanced gc bottom-most compaction");
@@ -981,37 +1248,60 @@ impl Timeline {
// The layer selection has the following properties:
// 1. If a layer is in the selection, all layers below it are in the selection.
// 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
let (layer_selection, gc_cutoff) = {
let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
let guard = self.layers.read().await;
let layers = guard.layer_map();
let gc_info = self.gc_info.read().unwrap();
if !gc_info.retain_lsns.is_empty() || !gc_info.leases.is_empty() {
return Err(CompactionError::Other(anyhow!(
"enhanced legacy compaction currently does not support retain_lsns (branches)"
)));
let mut retain_lsns_below_horizon = Vec::new();
let gc_cutoff = gc_info.cutoffs.select_min();
for (lsn, _timeline_id) in &gc_info.retain_lsns {
if lsn < &gc_cutoff {
retain_lsns_below_horizon.push(*lsn);
}
}
for lsn in gc_info.leases.keys() {
if lsn < &gc_cutoff {
retain_lsns_below_horizon.push(*lsn);
}
}
let gc_cutoff = Lsn::min(gc_info.cutoffs.horizon, gc_info.cutoffs.pitr);
let mut selected_layers = Vec::new();
// TODO: consider retain_lsns
drop(gc_info);
for desc in layers.iter_historic_layers() {
if desc.get_lsn_range().start <= gc_cutoff {
selected_layers.push(guard.get_from_desc(&desc));
}
}
(selected_layers, gc_cutoff)
retain_lsns_below_horizon.sort();
(selected_layers, gc_cutoff, retain_lsns_below_horizon)
};
let lowest_retain_lsn = retain_lsns_below_horizon
.first()
.copied()
.unwrap_or(gc_cutoff);
if cfg!(debug_assertions) {
assert_eq!(
lowest_retain_lsn,
retain_lsns_below_horizon
.iter()
.min()
.copied()
.unwrap_or(gc_cutoff)
);
}
info!(
"picked {} layers for compaction with gc_cutoff={}",
"picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}",
layer_selection.len(),
gc_cutoff
gc_cutoff,
lowest_retain_lsn
);
// Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs.
// Also, collect the layer information to decide when to split the new delta layers.
let mut all_key_values = Vec::new();
let mut downloaded_layers = Vec::new();
let mut delta_split_points = BTreeSet::new();
for layer in &layer_selection {
all_key_values.extend(layer.load_key_values(ctx).await?);
let resident_layer = layer.download_and_keep_resident().await?;
downloaded_layers.push(resident_layer);
let desc = layer.layer_desc();
if desc.is_delta() {
// TODO: is it correct to only record split points for deltas intersecting with the GC horizon? (exclude those below/above the horizon)
@@ -1021,86 +1311,22 @@ impl Timeline {
delta_split_points.insert(key_range.end);
}
}
// Key small to large, LSN low to high, if the same LSN has both image and delta due to the merge of delta layers and
// image layers, make image appear before than delta.
struct ValueWrapper<'a>(&'a crate::repository::Value);
impl Ord for ValueWrapper<'_> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
use crate::repository::Value;
use std::cmp::Ordering;
match (self.0, other.0) {
(Value::Image(_), Value::WalRecord(_)) => Ordering::Less,
(Value::WalRecord(_), Value::Image(_)) => Ordering::Greater,
_ => Ordering::Equal,
}
let mut delta_layers = Vec::new();
let mut image_layers = Vec::new();
for resident_layer in &downloaded_layers {
if resident_layer.layer_desc().is_delta() {
let layer = resident_layer.get_as_delta(ctx).await?;
delta_layers.push(layer);
} else {
let layer = resident_layer.get_as_image(ctx).await?;
image_layers.push(layer);
}
}
impl PartialOrd for ValueWrapper<'_> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl PartialEq for ValueWrapper<'_> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == std::cmp::Ordering::Equal
}
}
impl Eq for ValueWrapper<'_> {}
all_key_values.sort_by(|(k1, l1, v1), (k2, l2, v2)| {
(k1, l1, ValueWrapper(v1)).cmp(&(k2, l2, ValueWrapper(v2)))
});
let mut merge_iter = MergeIterator::create(&delta_layers, &image_layers, ctx);
// Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas.
// Data of the same key.
let mut accumulated_values = Vec::new();
let mut last_key = all_key_values.first().unwrap().0; // TODO: assert all_key_values not empty
/// Take a list of images and deltas, produce an image at the GC horizon, and a list of deltas above the GC horizon.
async fn flush_accumulated_states(
tline: &Arc<Timeline>,
key: Key,
accumulated_values: &[&(Key, Lsn, crate::repository::Value)],
horizon: Lsn,
) -> anyhow::Result<(Vec<(Key, Lsn, crate::repository::Value)>, bytes::Bytes)> {
let mut base_image = None;
let mut keys_above_horizon = Vec::new();
let mut delta_above_base_image = Vec::new();
// We have a list of deltas/images. We want to create image layers while collect garbages.
for (key, lsn, val) in accumulated_values.iter().rev() {
if *lsn > horizon {
if let Some((_, prev_lsn, _)) = keys_above_horizon.last_mut() {
if *prev_lsn == *lsn {
// The case that we have an LSN with both data from the delta layer and the image layer. As
// `ValueWrapper` ensures that an image is ordered before a delta at the same LSN, we simply
// drop this delta and keep the image.
//
// For example, we have delta layer key1@0x10, key1@0x20, and image layer key1@0x10, we will
// keep the image for key1@0x10 and the delta for key1@0x20. key1@0x10 delta will be simply
// dropped.
continue;
}
}
keys_above_horizon.push((*key, *lsn, val.clone()));
} else if *lsn <= horizon {
match val {
crate::repository::Value::Image(image) => {
base_image = Some((*lsn, image.clone()));
break;
}
crate::repository::Value::WalRecord(wal) => {
delta_above_base_image.push((*lsn, wal.clone()));
}
}
}
}
// do not reverse delta_above_base_image, reconstruct state expects reversely-ordered records
keys_above_horizon.reverse();
let state = ValueReconstructState {
img: base_image,
records: delta_above_base_image,
};
let img = tline.reconstruct_value(key, horizon, state).await?;
Ok((keys_above_horizon, img))
}
let mut last_key: Option<Key> = None;
async fn flush_deltas(
deltas: &mut Vec<(Key, Lsn, crate::repository::Value)>,
@@ -1108,7 +1334,7 @@ impl Timeline {
delta_split_points: &[Key],
current_delta_split_point: &mut usize,
tline: &Arc<Timeline>,
gc_cutoff: Lsn,
lowest_retain_lsn: Lsn,
ctx: &RequestContext,
) -> anyhow::Result<Option<ResidentLayer>> {
// Check if we need to split the delta layer. We split at the original delta layer boundary to avoid
@@ -1143,7 +1369,7 @@ impl Timeline {
tline.timeline_id,
tline.tenant_shard_id,
deltas.first().unwrap().0,
gc_cutoff..end_lsn,
lowest_retain_lsn..end_lsn,
ctx,
)
.await?;
@@ -1159,8 +1385,8 @@ impl Timeline {
self.conf,
self.timeline_id,
self.tenant_shard_id,
&(all_key_values.first().unwrap().0..all_key_values.last().unwrap().0.next()),
gc_cutoff,
&(Key::MIN..Key::MAX), // covers the full key range
lowest_retain_lsn,
ctx,
)
.await?;
@@ -1169,40 +1395,60 @@ impl Timeline {
let delta_split_points = delta_split_points.into_iter().collect_vec();
let mut current_delta_split_point = 0;
let mut delta_layers = Vec::new();
for item @ (key, _, _) in &all_key_values {
if &last_key == key {
accumulated_values.push(item);
while let Some((key, lsn, val)) = merge_iter.next().await? {
if last_key.is_none() || last_key.as_ref() == Some(&key) {
if last_key.is_none() {
last_key = Some(key);
}
accumulated_values.push((key, lsn, val));
} else {
let (deltas, image) =
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff)
.await?;
let last_key = last_key.as_mut().unwrap();
let retention = self
.generate_key_retention(
*last_key,
&accumulated_values,
gc_cutoff,
&retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
)
.await?;
// Put the image into the image layer. Currently we have a single big layer for the compaction.
image_layer_writer.put_image(last_key, image, ctx).await?;
delta_values.extend(deltas);
retention
.pipe_to(*last_key, &mut delta_values, &mut image_layer_writer, ctx)
.await?;
delta_layers.extend(
flush_deltas(
&mut delta_values,
last_key,
*last_key,
&delta_split_points,
&mut current_delta_split_point,
self,
gc_cutoff,
lowest_retain_lsn,
ctx,
)
.await?,
);
accumulated_values.clear();
accumulated_values.push(item);
last_key = *key;
*last_key = key;
accumulated_values.push((key, lsn, val));
}
}
let last_key = last_key.expect("no keys produced during compaction");
// TODO: move this part to the loop body
let (deltas, image) =
flush_accumulated_states(self, last_key, &accumulated_values, gc_cutoff).await?;
let retention = self
.generate_key_retention(
last_key,
&accumulated_values,
gc_cutoff,
&retain_lsns_below_horizon,
COMPACTION_DELTA_THRESHOLD,
)
.await?;
// Put the image into the image layer. Currently we have a single big layer for the compaction.
image_layer_writer.put_image(last_key, image, ctx).await?;
delta_values.extend(deltas);
retention
.pipe_to(last_key, &mut delta_values, &mut image_layer_writer, ctx)
.await?;
delta_layers.extend(
flush_deltas(
&mut delta_values,
@@ -1210,7 +1456,7 @@ impl Timeline {
&delta_split_points,
&mut current_delta_split_point,
self,
gc_cutoff,
lowest_retain_lsn,
ctx,
)
.await?,

View File

@@ -148,14 +148,14 @@ async fn cleanup_remaining_timeline_fs_traces(
/// For more context see comments in [`DeleteTimelineFlow::prepare`]
async fn remove_timeline_from_tenant(
tenant: &Tenant,
timeline_id: TimelineId,
timeline: &Timeline,
_: &DeletionGuard, // using it as a witness
) -> anyhow::Result<()> {
// Remove the timeline from the map.
let mut timelines = tenant.timelines.lock().unwrap();
let children_exist = timelines
.iter()
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
.any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id));
// XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
// We already deleted the layer files, so it's probably best to panic.
// (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
@@ -164,7 +164,7 @@ async fn remove_timeline_from_tenant(
}
timelines
.remove(&timeline_id)
.remove(&timeline.timeline_id)
.expect("timeline that we were deleting was concurrently removed from 'timelines' map");
drop(timelines);
@@ -182,13 +182,15 @@ async fn remove_timeline_from_tenant(
/// 5. Delete index part
/// 6. Delete meta, timeline directory
/// 7. Delete mark file
///
/// It is resumable from any step in case a crash/restart occurs.
/// There are three entrypoints to the process:
/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present
/// and we possibly neeed to continue deletion of remote files.
/// and we possibly neeed to continue deletion of remote files.
/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote
/// index but still have local metadata, timeline directory and delete mark.
/// index but still have local metadata, timeline directory and delete mark.
///
/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load.
#[derive(Default)]
pub enum DeleteTimelineFlow {
@@ -389,7 +391,6 @@ impl DeleteTimelineFlow {
Some(tenant_shard_id),
Some(timeline_id),
"timeline_delete",
false,
async move {
if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
error!("Error: {err:#}");
@@ -413,7 +414,7 @@ impl DeleteTimelineFlow {
pausable_failpoint!("in_progress_delete");
remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;
remove_timeline_from_tenant(tenant, timeline, &guard).await?;
*guard = Self::Finished;

Some files were not shown because too many files have changed in this diff Show More